In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import warnings
%matplotlib inline 

warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Load datasets

In [None]:
df = pd.read_csv('/content/gdrive/MyDrive/covid19.csv', encoding='utf-8')


In [None]:
df.info()

In [None]:
#remove pattern in the dataset
def remove_pattern(text):
  text = re.findall(pattern, df)
  for word in text:
    df = re.sub(word, "", df)
  
  return df
  df

In [None]:
#create clean function
def cleantxt(text):
  text = re.sub(r'@[A-Za-z0-9]+', '', text)
  text = re.sub(r'#', '', text)
  text = re.sub(r'RT[\s]+', '', text)
  text = re.sub(r'https?:\/\/\S+', '', text)
  text = re.sub(r'$', '', text)
  text = re.sub(r'\n', '', text)
  text = re.sub(r'0-9[A-Za-z]+', '', text)
  text = text.lower()
 


  return text
  
df['tweets'] = df['tweets'].apply(cleantxt)

df.head()

In [None]:
df['clean_text'] = df['tweets'].str.replace("[^a-zA-Z#]", " ")
df.head()

In [None]:
#remove stop words
df['clean_text'] = df['clean_text'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
df.head()

In [None]:
#tokenization
tokenized = df['clean_text'].apply(lambda x: x.split())
tokenized.head()

In [None]:
#stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

tokenized = tokenized.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
tokenized.head()

0    [preparativo, para, nuestra, patria, querida, ...
1    [dan, cadr, strat, vaccinal, lutt, covid, larg...
2    [nahuzubillahminzalik, allah, protect, from, t...
3    [covid, plu, ultra, gobierno, rescata, aerol, ...
4    [atenc, quer, taro, pasar, escenario, partir, ...
Name: clean_text, dtype: object

In [None]:
df['clean_text'] = tokenized
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,clean_text
0,0,: en los preparativos para el envío a nuestra ...,"[preparativo, para, nuestra, patria, querida, ..."
1,1,dans le cadre de la stratégie vaccinale de lut...,"[dan, cadr, strat, vaccinal, lutt, covid, larg..."
2,2,": nahuzubillahminzalik....""may allah protect u...","[nahuzubillahminzalik, allah, protect, from, t..."
3,3,: covid19 plus ultra | 🔴💰el gobierno rescata c...,"[covid, plu, ultra, gobierno, rescata, aerol, ..."
4,4,_qro: atención | querétaro pasará a escenario ...,"[atenc, quer, taro, pasar, escenario, partir, ..."


In [None]:
df['clean_text'] = df['tweets'].str.replace("[^a-zA-Z#]", " ")
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,clean_text
0,0,: en los preparativos para el envío a nuestra ...,en los preparativos para el env o a nuestra ...
1,1,dans le cadre de la stratégie vaccinale de lut...,dans le cadre de la strat gie vaccinale de lut...
2,2,": nahuzubillahminzalik....""may allah protect u...",nahuzubillahminzalik may allah protect u...
3,3,: covid19 plus ultra | 🔴💰el gobierno rescata c...,covid plus ultra el gobierno rescata c...
4,4,_qro: atención | querétaro pasará a escenario ...,qro atenci n quer taro pasar a escenario ...


In [None]:
#create a function to get subjectivity

from textblob import TextBlob

def getsubjectivity(text):
  return TextBlob(text).sentiment.subjectivity


#create a function to get polarity
def getpolarity(text):
  return TextBlob(text).sentiment.polarity

#create two new columns
df['subjectivity'] = df['tweets'].apply(getsubjectivity)
df['polarity'] = df['tweets'].apply(getpolarity)


#show new dataframe
df

In [None]:
#sentiment text function
def getSentiment(score):
    if score < 0:
        return 'Negative'
    elif score ==0:
        return 'Neutral'
    else:
        return 'Positive'
    
#add sentiment column to dataframe
df['sentiment'] = df['polarity'].apply(getSentiment)
df[200:500]

In [None]:
#Visualization and exploration of dataset
allWords = ' '.join( [sentence for sentence in df['clean_text']] )

from wordcloud import WordCloud
wordCloud = WordCloud(width = 800, height = 500, random_state=42, max_font_size=100).generate(allWords)

#plot wordcloud
plt.imshow(wordCloud, interpolation = "bilinear")
plt.axis('off')
plt.show()

In [None]:
#positive words visualization
allwords = " ".join([sentence for sentence in df['clean_text'][df['polarity'] > 0]])

wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(allwords)


#wordcloud

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#positive words visualization
allwords = " ".join([sentence for sentence in df['clean_text'][df['polarity'] < 0]])

wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(allwords)


#wordcloud

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#plot wordcloud 
allWords = ' '.join( [twts for twts in df['clean_text']] )
wordCloud = WordCloud(width = 500, height = 300, random_state=23, max_font_size=119).generate(allWords)

plt.imshow(wordCloud, interpolation = "bilinear")
plt.axis('off')
plt.show()

In [None]:
#plot polarity and subjectivity
plt.figure(figsize=(8,6))
for i in range(0, df.shape[0]):
  plt.scatter(df['polarity'][i], df['subjectivity'][i], color='Blue' )

plt.title('Sentiment Analysis')
plt.xlabel('polarity')
plt.ylabel('subjectivity')
plt.show()

In [None]:
#plot and visualize counts
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Counts')

df['sentiment'].value_counts().plot(kind='bar')
plt.show()

In [None]:
#creating a label for the tweets
def createlabel(score):
  if score < 0:
    return -1
  elif score >0 and score < 1:
    return 1
  else:
    return 0


#add label top dataframe
df['label'] = df['polarity'].apply(createlabel)
df[200:215]

In [None]:
#input split and feature extraction
from sklearn.feature_extraction.text import CountVectorizer
bow_vect = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow = bow_vect.fit_transform(df['clean_text'])

In [None]:
# bow[0].toarray()

In [None]:
bow

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(bow, df['label'], random_state=42, test_size=0.30) 

In [None]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [None]:
#model training

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# model training
model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
# testing
pred = model.predict(x_test)
accuracy_score(y_test, pred)

In [None]:
# use probabilty to get output
pred_prob = model.predict_proba(x_test)
pred = pred_prob[:,1] >= 0.3
pred = pred.astype(np.int)

#f1_score(y_test, pred)
accuracy_score(y_test, pred)

In [None]:
pred_prob[0]

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

BNmodel = BernoulliNB(alpha = 2)
SVCmodel = LinearSVC()
LRmodel = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)

In [None]:
# model training BNB
model = BernoulliNB()
model.fit(x_train, y_train)

In [None]:
# testing BNB
model.fit(x_test, y_test)
pred = model.predict(x_test)
accuracy_score(y_test, pred)

In [None]:
# model training LSVC
model2 = LinearSVC()
model2.fit(x_train, y_train)

In [None]:
# testing LSVM
model2.fit(x_test, y_test)
pred = model2.predict(x_test)
accuracy_score(y_test, pred)

In [None]:
# second split
train, test = train_test_split(df, test_size = 0.2, random_state=42)

In [None]:
x_train = train['clean_text']
x_test = test['clean_text']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vector = TfidfVectorizer(use_idf=True)

In [None]:
x_train = vector.fit_transform(x_train)
x_test = vector.transform(x_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
def models(model):
  y_pred = model.predict(x_test)

  accuracy = accuracy_score(y_pred, x_test['sentiment'])
  recall = recall_score(y_pred, x_test['sentiment'].pos_label='negative')
  precision = precision_score(y_pred, test['sentiment'].pos_label='negative')

  return (accuracy, recall, precision)