# Sentiment Analysis Covid-19 tweets

In [32]:
import numpy as np 
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

plt.style.use('seaborn-poster')
%matplotlib inline
plt.rcParams['figure.figsize'] = 12, 6
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)

## Clasification Problem  
Sentiment analysis is a supervised classification problem because:  
1. We want to estimate a class('sad', 'joy', 'fear', 'anger') for a given tweet.
2. We train the model with labeled data.

In [2]:
tweets_df = pd.read_csv("covid19_tweets.csv", parse_dates=['user_created', 'date'])
tweets_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [3]:
labeled_data = pd.read_csv('labeled_sentiment.csv')
labeled_data.head(10)

Unnamed: 0,sentiment,text
0,sad,agree the poor in india are treated badly thei...
1,joy,if only i could have spent the with this cutie...
2,joy,will nature conservation remain a priority in ...
3,sad,coronavirus disappearing in italy show this to...
4,sad,uk records lowest daily virus death toll since...
5,fear,joe biden's coronavirus web address lands on a...
6,sad,respected sir in our telangana all private tea...
7,fear,so is also 20 times more lethal than influenza...
8,sad,thull is passing the most dangerous and ultra ...
9,sad,thull is passing the most dangerous and ultra ...


In [4]:
labeled_data['sentiment'].value_counts()

fear     801
sad      795
anger    767
joy      727
Name: sentiment, dtype: int64

### Text pre-processing for machine learning  
- converting to lowercase
- remove text in square brackets,
- remove links,
- remove punctuation
- remove words containing numbers
- remove Punctuation
- remove stopwords
- Stemming
- Lemmatization
- Decontraction

In [5]:
import nltk
import re
import string

def decontraction(text):
    text = re.sub(r"won\'t", " will not", text)
    text = re.sub(r"won\'t've", " will not have", text)
    text = re.sub(r"can\'t", " can not", text)
    text = re.sub(r"don\'t", " do not", text)
    
    text = re.sub(r"can\'t've", " can not have", text)
    text = re.sub(r"ma\'am", " madam", text)
    text = re.sub(r"let\'s", " let us", text)
    text = re.sub(r"ain\'t", " am not", text)
    text = re.sub(r"shan\'t", " shall not", text)
    text = re.sub(r"sha\n't", " shall not", text)
    text = re.sub(r"o\'clock", " of the clock", text)
    text = re.sub(r"y\'all", " you all", text)

    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"n\'t've", " not have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'d've", " would have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ll've", " will have", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'re", " are", text)
    return text 

def clean_text(text):
    text = text.lower() # to lower case
    text = re.sub('\[.*?\]', '', text) # text in brakets
    text = re.sub('https?://\S+|www\.\S+', '', text) # links
    text = re.sub('<.*?>+', '', text)  # html
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    text = re.sub('\n', '', text) # end of line
    text = re.sub('\w*\d\w*', '', text) # words with numbers
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    text = decontraction(text)
    return text

In [6]:
tweets_df['text'] = tweets_df['text'].apply(clean_text)
labeled_data['text'] = labeled_data['text'].apply(clean_text)

### Create training/test data sets

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

train, test = train_test_split(labeled_data, test_size=0.2, random_state=0, stratify=labeled_data['sentiment'].values)
print("train shape : ", train.shape)
print("test shape : ", test.shape)

train shape :  (2472, 2)
test shape :  (618, 2)


In [14]:
nltk.download('stopwords')
stop = list(stopwords.words('english'))
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\109666\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
vectorizer = CountVectorizer(decode_error = 'replace',stop_words = stop)

x_train = vectorizer.fit_transform(train.text.values)
x_test = vectorizer.transform(test.text.values)

y_train = train.sentiment.values
y_test = test.sentiment.values

print("x_train.shape : ", x_train.shape)
print("x_test.shape : ", x_test.shape)
print("y_train.shape : ", y_train.shape)
print("y_test.shape : ", y_test.shape)

x_train.shape :  (2472, 8551)
x_test.shape :  (618, 8551)
y_train.shape :  (2472,)
y_test.shape :  (618,)


## Train a Logistic Regression Model with Labeled Data Set

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
logreg = LogisticRegression()

logreg.fit(x_train, y_train)

test_prediction = logreg.predict(x_test)
logreg_accuracy = accuracy_score(y_test, test_prediction)
print("Training accuracy Score    : ",logreg.score(x_train,y_train))
print("Test accuracy Score : ",logreg_accuracy )
print(classification_report(test_prediction, y_test))

Training accuracy Score    :  0.9955501618122977
Test accuracy Score :  0.6957928802588996
              precision    recall  f1-score   support

       anger       0.59      0.64      0.61       143
        fear       0.58      0.63      0.60       148
         joy       0.85      0.71      0.77       173
         sad       0.77      0.80      0.79       154

    accuracy                           0.70       618
   macro avg       0.70      0.69      0.69       618
weighted avg       0.71      0.70      0.70       618



## Predict Sentiment on Covid-19 Tweets

In [33]:
covid_tweets_matrix = vectorizer.transform(tweets_df["text"].values)

prediction = logreg.predict(covid_tweets_matrix)

covid_tweets_sentiment = pd.DataFrame({"sentiment": prediction, "text": tweets_df["text"]})
covid_tweets_sentiment.head(20)

Unnamed: 0,sentiment,text
0,fear,if i smelled the scent of hand sanitizers today on someone in the past i would think they were so intoxicated that…
1,joy,hey yankees yankeespr and mlb wouldnt it have made more sense to have the players pay their respects to the a…
2,anger,wdunlap realdonaldtrump trump never once claimed was a hoax we all claim that this effort to…
3,anger,brookbanktv the one gift has give me is an appreciation for the simple things that were always around me…
4,joy,july media bulletin on novel coronavirusupdates drsyedsehrish airnewsalerts ani…
5,sad,coronavirus deaths continue to rise its almost as bad as it ever was politicians and businesses want…
6,joy,how will change work in general and recruiting specifically via proactivetalent recruiting…
7,joy,you now have to wear face coverings when out shopping this includes a visit to your local community pharmacy…
8,joy,praying for good health and recovery of chouhanshivraj
9,joy,pope as god prophet sadhu sundar selvaraj watch here at hurricanehanna …
