In [1]:
#reading the data from csv

import pandas as pd

covid = pd.read_csv('test.csv')

In [2]:
#using the Tweets dataset for training on sentiment polarity

train_dataset = pd.read_csv('Tweets.csv')

In [3]:
#removing @ and # from the training data

def remove_at(x):
    x = str(x).replace('@', '')
    x = str(x).replace('#', '')
    return x

In [4]:
train_dataset['text'] = train_dataset['text'].apply(lambda x: remove_at(x))

In [5]:
#tokenizing using RegexTokenizer by consideringing only alphabets and numbers

import nltk

tokenizer = nltk.RegexpTokenizer('\w+')

def senti_doc(x):
    t = tokenizer.tokenize(x)
    return t 

In [6]:
doc = train_dataset.text.apply(lambda x: senti_doc(x))

In [7]:
#creating tuples of tokens and corresponding sentiments

docs = []

for i in range(0, len(train_dataset['airline_sentiment'])):
    docs.append((doc[i], train_dataset['airline_sentiment'][i]))

In [8]:
#shuffling the documents

import random

random.shuffle(docs)

In [9]:
#defining set of words that will be used for features

all_words = [word for (sentance,category) in docs for word in sentance]
top_words = nltk.FreqDist(all_words)
most_common_words = top_words.most_common(2000)
word_features = [word for (word,count) in most_common_words]

In [10]:
len(set(all_words))

18042

In [11]:
#defining features for each document using unigram features

def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [12]:
#defining feature sets for the documents

featuresets = [(document_features(d, word_features), c) for (d, c) in docs]

featuresets[0]

({'contains(to)': False,
  'contains(I)': False,
  'contains(the)': True,
  'contains(a)': True,
  'contains(for)': False,
  'contains(you)': False,
  'contains(united)': True,
  'contains(on)': False,
  'contains(and)': True,
  'contains(t)': False,
  'contains(flight)': False,
  'contains(my)': False,
  'contains(USAirways)': False,
  'contains(AmericanAir)': False,
  'contains(is)': False,
  'contains(in)': True,
  'contains(SouthwestAir)': False,
  'contains(JetBlue)': False,
  'contains(of)': True,
  'contains(it)': False,
  'contains(me)': False,
  'contains(that)': False,
  'contains(have)': False,
  'contains(your)': False,
  'contains(was)': False,
  'contains(s)': False,
  'contains(with)': False,
  'contains(at)': False,
  'contains(not)': False,
  'contains(can)': False,
  'contains(get)': False,
  'contains(co)': False,
  'contains(be)': False,
  'contains(from)': False,
  'contains(this)': False,
  'contains(http)': False,
  'contains(no)': False,
  'contains(but)': False

In [13]:
len(featuresets)

14640

In [14]:
#Naive Bayes classifier with 3-fold cross validation for training on sentiments using unigrams

import numpy as np
from sklearn.model_selection import KFold

kf = KFold(n_splits = 3)
sum = 0

for train, test in kf.split(featuresets):
    train_data = np.array(featuresets)[train]
    test_data = np.array(featuresets)[test]
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    sum += nltk.classify.accuracy(classifier, test_data)

acc1 = sum/3

In [15]:
#accuracy of classifier1

acc1

0.7669398907103826

In [16]:
classifier.classify(document_features(nltk.word_tokenize("best want"), word_features))

'positive'

In [17]:
#creating bigram features

all_bigrams = list(nltk.bigrams(all_words))

from nltk.collocations import *
import re

stopwords = nltk.corpus.stopwords.words('english')

def alpha(w):
    pattern = re.compile('^[^a-z]+$')
    if(pattern.match(w)):
        return True
    else:
        return False

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words)
finder.apply_word_filter(alpha)
finder.apply_word_filter(lambda w: w in stopwords)
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored[:10]

[(('customer', 'service'), 0.0019616017390139137),
 (('Cancelled', 'Flightled'), 0.0018871576502467822),
 (('Late', 'Flight'), 0.0009268289051507866),
 (('Cancelled', 'Flighted'), 0.0008374959986302288),
 (('Late', 'Flightr'), 0.0005769416879452687),
 (('JetBlue', 'Our'), 0.0005434418480000596),
 (('Booking', 'Problems'), 0.000539719643561703),
 (('Cancelled', 'Flight'), 0.000539719643561703),
 (('Our', 'fleet'), 0.000539719643561703),
 (('fleek', 'http'), 0.0005322752346849899)]

In [18]:
bigram_features = [bigram for (bigram, count) in scored[:2000]]
bigram_features[:30]

[('customer', 'service'),
 ('Cancelled', 'Flightled'),
 ('Late', 'Flight'),
 ('Cancelled', 'Flighted'),
 ('Late', 'Flightr'),
 ('JetBlue', 'Our'),
 ('Booking', 'Problems'),
 ('Cancelled', 'Flight'),
 ('Our', 'fleet'),
 ('fleek', 'http'),
 ('USAirways', 'AmericanAir'),
 ('Flight', 'Booking'),
 ('call', 'back'),
 ('united', 'thanks'),
 ('JetBlue', 'thanks'),
 ('gate', 'agent'),
 ('AmericanAir', 'thanks'),
 ('flight', 'united'),
 ('connecting', 'flight'),
 ('get', 'home'),
 ('flight', 'Cancelled'),
 ('last', 'night'),
 ('hours', 'Late'),
 ('USAirways', 'thanks'),
 ('hour', 'delay'),
 ('help', 'USAirways'),
 ('first', 'class'),
 ('flight', 'attendant'),
 ('reFlight', 'Booking'),
 ('united', 'yes')]

In [19]:
#defining features for each document using bigram features

def bi_document_features(document, bigram_features):
    document_words = list(nltk.bigrams(document))
    features = {}
    for word in bigram_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [20]:
featuresets2 = [(bi_document_features(d, bigram_features), c) for (d, c) in docs]

featuresets2[0]

({"contains(('customer', 'service'))": False,
  "contains(('Cancelled', 'Flightled'))": False,
  "contains(('Late', 'Flight'))": False,
  "contains(('Cancelled', 'Flighted'))": False,
  "contains(('Late', 'Flightr'))": False,
  "contains(('JetBlue', 'Our'))": False,
  "contains(('Booking', 'Problems'))": False,
  "contains(('Cancelled', 'Flight'))": False,
  "contains(('Our', 'fleet'))": False,
  "contains(('fleek', 'http'))": False,
  "contains(('USAirways', 'AmericanAir'))": False,
  "contains(('Flight', 'Booking'))": False,
  "contains(('call', 'back'))": False,
  "contains(('united', 'thanks'))": False,
  "contains(('JetBlue', 'thanks'))": False,
  "contains(('gate', 'agent'))": False,
  "contains(('AmericanAir', 'thanks'))": False,
  "contains(('flight', 'united'))": False,
  "contains(('connecting', 'flight'))": False,
  "contains(('get', 'home'))": False,
  "contains(('flight', 'Cancelled'))": False,
  "contains(('last', 'night'))": False,
  "contains(('hours', 'Late'))": False,

In [21]:
len(featuresets2)

14640

In [22]:
#Naive Bayes classifier with 3-fold cross validation for training on sentiments using bigram features

kf = KFold(n_splits = 3)
sum = 0

for train, test in kf.split(featuresets2):
    train_data2 = np.array(featuresets2)[train]
    test_data2 = np.array(featuresets2)[test]
    classifier2 = nltk.NaiveBayesClassifier.train(train_data2)
    sum += nltk.classify.accuracy(classifier2, test_data2)

acc2 = sum/3

In [23]:
#accuracy of classifier2

acc2

0.6756147540983607

In [24]:
#considering negation words with unigram approach

negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = False
        features['contains(NOT{})'.format(word)] = False
    
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['contains(NOT{})'.format(document[i])] = (document[i] in word_features)
        else:
            features['contains({})'.format(word)] = (word in word_features)
    return features

In [25]:
NOT_featuresets = [(NOT_features(d, word_features, negationwords), c) for (d, c) in docs]
NOT_featuresets[0][0]['contains(NOTlike)']

False

In [26]:
#Naive Bayes classifier with 3-fold cross validation for training on sentiments using negation words and unigram features

kf = KFold(n_splits = 3)
sum = 0

for train, test in kf.split(NOT_featuresets):
    train_data3 = np.array(NOT_featuresets)[train]
    test_data3 = np.array(NOT_featuresets)[test]
    classifier3 = nltk.NaiveBayesClassifier.train(train_data3)
    sum += nltk.classify.accuracy(classifier3, test_data3)

acc3 = sum/3

In [27]:
#accuracy of classifier3

acc3

0.7594262295081968

In [28]:
#determining the sentiments of the 1/4th of the comments using classifier1

pos_sent = []
neg_sent = []
neu_sent = []

title = []
author = []
country = []
total_pos = []
total_neg = []
total_neu = []

for i in range(0, int(len(covid['text'])/4)):
    sentences = nltk.sent_tokenize(covid['text'][i])
    pos_count = 0
    neg_count = 0
    neu_count = 0
    for sents in sentences:
        senti = classifier.classify(document_features(nltk.word_tokenize(sents), word_features))

        if senti == 'positive':
            pos_sent.append(sents)
            pos_count += 1

        elif senti == 'negative':
            neg_sent.append(sents)
            neg_count += 1
    
        else:
            neu_sent.append(sents)
            neu_count += 1
            
    title.append(covid['title'][i])
    author.append(covid['author'][i])
    country.append(covid['country'][i])
    
    total_pos.append(pos_count)
    total_neg.append(neg_count)
    total_neu.append(neu_count)

In [29]:
len(pos_sent)

106295

In [30]:
len(neg_sent)

302118

In [31]:
len(neu_sent)

175037

In [32]:
#POS tagging the positive sentences

tokens_pos = [nltk.word_tokenize(sent) for sent in pos_sent]
tags_pos = [nltk.pos_tag(tok) for tok in tokens_pos]

In [33]:
#POS tagging the negative sentences

tokens_neg = [nltk.word_tokenize(sent) for sent in neg_sent]
tags_neg = [nltk.pos_tag(tok) for tok in tokens_neg]

In [34]:
#finding most frequent positive words

import itertools

all_tags_pos = list(itertools.chain.from_iterable(tags_pos))

cfd_pos = nltk.ConditionalFreqDist(((tags_pos, word) for (word, tags_pos) in all_tags_pos))

In [35]:
#top 50 adjectives in positive sentences

list(cfd_pos['JJ'])[0:50]

['new',
 'Chinese',
 'other',
 'first',
 'last',
 '“',
 'good',
 's',
 'global',
 'medical',
 'many',
 'such',
 'great',
 'major',
 'social',
 'novel',
 'top',
 'much',
 'local',
 'positive',
 'economic',
 'high',
 'big',
 'second',
 'early',
 'few',
 'same',
 'next',
 'confirmed',
 'international',
 'Canadian',
 'safe',
 'recent',
 'important',
 'strong',
 'total',
 'further',
 'public',
 'negative',
 'free',
 'late',
 'epidemic',
 'central',
 'foreign',
 'own',
 'little',
 'potential',
 'American',
 'severe',
 '’']

In [36]:
#top 50 adverbs in positive sentences

list(cfd_pos['RB'])[0:50]

['also',
 'very',
 'well',
 'so',
 'as',
 'not',
 'just',
 'far',
 'here',
 'now',
 'much',
 'up',
 'back',
 'really',
 'even',
 'too',
 'first',
 "n't",
 'So',
 'ahead',
 'down',
 'still',
 'quickly',
 'always',
 'However',
 'especially',
 'later',
 'already',
 'closely',
 'away',
 'only',
 'often',
 'Meanwhile',
 'however',
 'recently',
 'forward',
 'soon',
 'again',
 'globally',
 'definitely',
 'nearly',
 'mostly',
 'then',
 'Not',
 'together',
 'Also',
 'there',
 'particularly',
 'Here',
 'Still']

In [37]:
#top 50 verbs in positive sentences

list(cfd_pos['VB'])[0:50]

['be',
 '“',
 'help',
 'see',
 'keep',
 'make',
 'have',
 'work',
 'continue',
 'take',
 'contain',
 'get',
 'prevent',
 'spread',
 'Read',
 'stay',
 'ensure',
 'go',
 'follow',
 'protect',
 'win',
 'support',
 'stop',
 'become',
 'do',
 'raise',
 'say',
 'identify',
 'remain',
 '”',
 'treat',
 'monitor',
 're',
 'See',
 'end',
 'give',
 'meet',
 'announce',
 'move',
 'face',
 'come',
 'reduce',
 'look',
 'report',
 'read',
 'feel',
 'Let',
 'Keep',
 'join',
 'start']

In [38]:
#finding most frequent negative words

all_tags_neg = list(itertools.chain.from_iterable(tags_neg))

cfd_neg = nltk.ConditionalFreqDist(((tags_neg, word) for (word, tags_neg) in all_tags_neg))

In [39]:
#top 50 adjectives in negative sentences

list(cfd_neg['JJ'])[0:50]

['new',
 'Chinese',
 'other',
 'last',
 'first',
 'global',
 'public',
 '“',
 'due',
 'many',
 'medical',
 's',
 'such',
 'novel',
 '’',
 'confirmed',
 'local',
 'international',
 'positive',
 'economic',
 'same',
 'next',
 'further',
 'early',
 'major',
 'British',
 'past',
 'central',
 'close',
 'several',
 'epidemic',
 'good',
 'deadly',
 'recent',
 'social',
 'full',
 'total',
 '2019-nCoV',
 'few',
 'second',
 'financial',
 'high',
 '”',
 'potential',
 'severe',
 'possible',
 'infected',
 'likely',
 'able',
 'own']

In [40]:
#top 50 verbs in negative sentences

list(cfd_neg['VB'])[0:50]

['be',
 'have',
 'get',
 'take',
 '“',
 'make',
 '[',
 'help',
 'do',
 'keep',
 'see',
 'go',
 'prevent',
 'contain',
 'stay',
 'work',
 'spread',
 'ensure',
 'continue',
 'leave',
 'provide',
 'find',
 'stop',
 'know',
 'come',
 'say',
 'protect',
 'avoid',
 'remain',
 'travel',
 'give',
 'use',
 'return',
 'put',
 '”',
 'reduce',
 'deal',
 'support',
 'buy',
 'bring',
 'close',
 'cause',
 'improve',
 'need',
 'allow',
 'respond',
 '’',
 'start',
 'try',
 'evacuate']

In [41]:
#top 50 adverbs in negative sentences

list(cfd_neg['RB'])[0:50]

['not',
 'also',
 'now',
 'still',
 'just',
 'so',
 'only',
 'even',
 'as',
 "n't",
 'back',
 'very',
 'well',
 'far',
 'already',
 'then',
 'ago',
 'here',
 'there',
 'nearly',
 'up',
 'yet',
 'too',
 'recently',
 'almost',
 'again',
 'first',
 'really',
 'down',
 'later',
 'currently',
 'However',
 'much',
 'So',
 'especially',
 'away',
 'ahead',
 'never',
 'right',
 'soon',
 'however',
 'previously',
 'together',
 'globally',
 'Meanwhile',
 'closely',
 'Now',
 'actually',
 'about',
 'long']

In [42]:
#creating a dataframe for title, cuthor, country and number of sentences

df = pd.DataFrame(list(zip(title, author, country, total_pos, total_neg, total_neu)), 
                       columns = ['title', 'author', 'country', 'n_pos_sents', 'n_neg_sents', 'n_neu_sents'])

In [43]:
df

Unnamed: 0,title,author,country,n_pos_sents,n_neg_sents,n_neu_sents
0,"Karnataka: Helplines, isolation wards set up f...",Udayavani,IN,2,4,2
1,Health dept. monitoring 24 people for possible...,,US,0,1,2
2,,jmccorm,US,0,2,0
3,Asian Markets Mostly Higher,rttnews.com,US,7,4,11
4,Tesla soars as bearish analysts left with litt...,Joe Easton,CA,2,14,8
...,...,...,...,...,...,...
26499,Australian Dollar Edges Higher as Aussie Home ...,Colin Lawrence,GB,6,9,12
26500,Loal actor uses humor to highlight anti-Asian ...,WCVB-TV,US,2,11,1
26501,"Eight days in Wuhan, cut off from the world",,ZA,15,41,15
26502,No plans to set up coronavirus quarantine site...,Katya Slepian,US,2,9,6


In [44]:
#writing the dataframe to csv file

df.to_csv('sentiments.csv', index = False)

In [45]:
pos_sent[:3]

['All district hospitals will have five beds isolated for patients carrying the virus.',
 '104 Arogya Sahayavani helpline run by the Health Department will take all calls related to the virus.',
 'The international agency noted that China was doing everything it could to contain the outbreak.']

In [46]:
neg_sent[:3]

['Bengaluru: Isolation wards in hospitals across Karnataka and helpline to take calls on coronavirus-related queries are ready to prevent any further spread of the virus after the first case in India was reported from Kerala yesterday.',
 'The Chief Secretary of the state government on Thursday held a meeting with the Additional Chief Secretary (Health), Health Commissioner, Mission Director of the National Health Mission and other health department officials and reviewed the state preparedness to tackle any cases of coronavirus whenever reported.',
 'Rajiv Gandhi Institute of Chest Diseases (RGICD) with 15 beds and Wenlock Hospital at Mangaluru with 10 beds have been selected for the treatment of the virus.']

In [47]:
neu_sent[:3]

['Along with this, at least ten private hospitals in Bengaluru will be setting up similar isolation wards.',
 'Udayavani English',
 'Aside from monitoring suspected cases, authorities are also preparing plans to repatriate Filipinos in the Chinese province of Hubei.']