# Vaccine Sentiment Classification
*by Nefeli Tavoulari*

#### In this notebook I am using a dataset provided by Dr. Saptarshi Ghosh and Soham Poddar from the Department of Computer Science and Engineering, IIT Kharagpur, India.

## Install Dependencies

In [6]:
!sudo pip install --upgrade pip
!pip install scikit-learn
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install langdetect



## Import Packages

In [7]:
%matplotlib inline
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from google.colab import files
import io
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import re
import langdetect

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Some Globals

In [8]:
  porter = PorterStemmer()
  stop_words = set(stopwords.words('english'))

## Upload dataset - Create dataframe

In [9]:
uploaded = files.upload()

Saving vs_train.csv to vs_train (10).csv


In [10]:
df = pd.read_csv(io.BytesIO(uploaded['vs_train.csv']))
print(df)

       Unnamed: 0                                              tweet  label
0               0  Sip N Shop Come thru right now #Marjais #Popul...      0
1               1  I don't know about you but My family and I wil...      1
2               2  @MSignorile Immunizations should be mandatory....      2
3               3  President Obama spoke in favor of vaccination ...      0
4               4  "@myfoxla: Arizona monitoring hundreds for mea...      0
...           ...                                                ...    ...
15971       15971  @Salon if u believe the anti-vax nutcases caus...      1
15972       15972  How do you feel about parents who don't #vacci...      0
15973       15973  70 Preschoolers Tested for Measles in Simi Val...      0
15974       15974  Finance Minister: Budget offers room to procur...      0
15975       15975  Are you up to date on vaccines? Take CDC’s vac...      2

[15976 rows x 3 columns]


## Remove empty tweets

In [11]:
df.dropna(subset = ["tweet"], inplace=True)
df.drop_duplicates(subset = ["tweet"], inplace=True)
df.shape

(15881, 3)

## Check dataset balance



In [13]:
df.groupby("label")["tweet"].count()

label
0    7385
1    2070
2    6426
Name: tweet, dtype: int64

## Text Preprocessing

In [14]:
def clean_text(text):
  if text is None:
    return None
  if len(text) == 0:
    return None
    
  # remove tweets written in languages different from english
  try:
      lang = langdetect.detect(text)
  except:
      lang = 'Other'
  if lang != 'en':
    return None
    

  # remove special characters ,white spaces, urls
  text = text.lower()
  text = text.strip() 
  text = re.sub(r'[^\w\s]',' ',text)
  text = re.sub(r'\_',' ',text)
  text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

  # perform stemming
  cleaned_text = ""
  for word in text.split() :
    temp = porter.stem(word)
    cleaned_text += (temp + " ")

  # tokenize
  #tokens = word_tokenize(cleaned_text)
  #cleaned_text = [word for word in tokens if not word.lower() in stop_words]
 

  if cleaned_text is None:
    return None
  return cleaned_text


In [15]:
cleaned_text = []
for line in df["tweet"]:
  cleaned_text.append(clean_text(line))

## Remove empty tweets again

In [16]:
df = df.assign(clean_tweet = lambda x: cleaned_text)
df.dropna(subset = ["clean_tweet"], inplace=True)
df

Unnamed: 0.1,Unnamed: 0,tweet,label,clean_tweet
0,0,Sip N Shop Come thru right now #Marjais #Popul...,0,sip n shop come thru right now marjai popularn...
1,1,I don't know about you but My family and I wil...,1,i don t know about you but my famili and i wil...
2,2,@MSignorile Immunizations should be mandatory....,2,msignoril immun should be mandatori period in ...
3,3,President Obama spoke in favor of vaccination ...,0,presid obama spoke in favor of vaccin for chil...
4,4,"""@myfoxla: Arizona monitoring hundreds for mea...",0,myfoxla arizona monitor hundr for measl link t...
...,...,...,...,...
15971,15971,@Salon if u believe the anti-vax nutcases caus...,1,salon if u believ the anti vax nutcas caus mea...
15972,15972,How do you feel about parents who don't #vacci...,0,how do you feel about parent who don t vaccin ...
15973,15973,70 Preschoolers Tested for Measles in Simi Val...,0,70 preschool test for measl in simi valley htt...
15974,15974,Finance Minister: Budget offers room to procur...,0,financ minist budget offer room to procur covi...


In [17]:
df.shape

(15491, 4)

In [19]:
cv = CountVectorizer(stop_words='english')
cvw = cv.fit_transform(df['clean_tweet'])
dfc = pd.DataFrame(cvw.A,columns=cv.get_feature_names())
dfc

In [25]:
tf_idf = TfidfTransformer(smooth_idf=True, use_idf=True)
tf_idf_v = tf_idf.fit_transform(cvw)
pd.DataFrame(tf_idf_v.todense(),columns=cv.get_feature_names())

Unnamed: 0,00,000,000000,00006,0003,0006,000x,007madridista,00am,00pm,01,010khyjph1,01thsog5d0,02,021bg8gl,024,02am,03,032,036,03sns2qqfi,03zhghym8l,04,040,0442,04mlgh9pbb,04p2vqlysb,04pw9u6m93,05,05am,05io7ionwo,05lszh8zjj,06,060,06uas6bs7h,07,07v2faqimb,07xphgl1am,08,08105,...,zxpijzz7qa,zxscyqzz99,zy0tqwfjj4,zy1qpuyvn3,zy5ajked2l,zycov,zydu,zynpkmobqq,zyrbgtrvzn,zyvls0sx2u,zyxzd4wgyj,zz34huixfj,zzbgbv6n8t,zzdvhtrpo,zzelda01,zzpko9jc9i,zzseetiiss,zztpipx8r5,zzuxc43tzm,zzvruknuru,zzyqmzaz,zzzpk5rk,zzzquil,есть,написать,оbama,россия,стране,чем,לאתעמודעלדםרעך,روسيا,كرونا,आई,इस,पहल,बड,शखबर,सकत,समस,呆れ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
