In [1]:
import pandas as pd
import numpy as np
import PIL

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier


from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report 

In [2]:
df = pd.read_csv('external_labelled_tweets_clean.csv')
df

Unnamed: 0.1,Unnamed: 0,text,VADERsentiment
0,0,Gahan,Neutral
1,1,advice Talk neighbours family exchange phone n...,Positive
2,2,Coronavirus Australia Woolworths give elderly ...,Positive
3,3,"My food stock one empty ... PLEASE , ' panic ,...",Positive
4,4,"Me , ready go supermarket #COVID19 outbreak. N...",Negative
...,...,...,...
41152,41152,Airline pilots offering stock supermarket shel...,Neutral
41153,41153,Response complaint provided citing COVID 19 re...,Negative
41154,41154,You know getting tough rationing toilet paper ...,Positive
41155,41155,Is wrong smell hand sanitizer starting turn #c...,Positive


In [3]:
df = df.dropna()

In [4]:
df['VADERsentiment'].value_counts()

Positive    18940
Negative    15368
Neutral      6848
Name: VADERsentiment, dtype: int64

In [5]:
df['VADERsentiment'] = np.where(df['VADERsentiment'] == 'Positive', 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['VADERsentiment'] = np.where(df['VADERsentiment'] == 'Positive', 1, 0)


In [6]:
df

Unnamed: 0.1,Unnamed: 0,text,VADERsentiment
0,0,Gahan,0
1,1,advice Talk neighbours family exchange phone n...,1
2,2,Coronavirus Australia Woolworths give elderly ...,1
3,3,"My food stock one empty ... PLEASE , ' panic ,...",1
4,4,"Me , ready go supermarket #COVID19 outbreak. N...",0
...,...,...,...
41152,41152,Airline pilots offering stock supermarket shel...,0
41153,41153,Response complaint provided citing COVID 19 re...,0
41154,41154,You know getting tough rationing toilet paper ...,1
41155,41155,Is wrong smell hand sanitizer starting turn #c...,1


In [7]:
binary_df = df

In [8]:
binary_df['VADERsentiment'].value_counts()

0    22216
1    18940
Name: VADERsentiment, dtype: int64

In [9]:
pos_samples = binary_df[binary_df['VADERsentiment']==1]
neg_samples = binary_df[binary_df['VADERsentiment']==0]

In [10]:
balanced_binary_df = pd.concat([pos_samples, neg_samples])

In [11]:
balanced_binary_df

Unnamed: 0.1,Unnamed: 0,text,VADERsentiment
1,1,advice Talk neighbours family exchange phone n...,1
2,2,Coronavirus Australia Woolworths give elderly ...,1
3,3,"My food stock one empty ... PLEASE , ' panic ,...",1
5,5,As news region first confirmed COVID 19 case c...,1
6,6,Cashier grocery store sharing insights #Covid ...,1
...,...,...,...
41147,41147,Y really shitting much home #COVID19 #coronavi...,0
41149,41149,Still shocked number #Toronto supermarket empl...,0
41152,41152,Airline pilots offering stock supermarket shel...,0
41153,41153,Response complaint provided citing COVID 19 re...,0


In [12]:
balanced_binary_df['VADERsentiment'].value_counts()

0    22216
1    18940
Name: VADERsentiment, dtype: int64

In [13]:
tweets = balanced_binary_df['text']
sentiments = balanced_binary_df['VADERsentiment']

In [14]:
X_train, X_test, y_train, y_test  = train_test_split(tweets, sentiments, test_size=0.2, random_state=42)

In [15]:
cv = CountVectorizer(analyzer='word', ngram_range=(1,2)) 

In [16]:
cv_train_features = cv.fit_transform(X_train) 

In [17]:
cv_test_features = cv.transform(X_test)

In [58]:
# Support vector machine model
svm = SGDClassifier(loss='hinge', l1_ratio=0.15, max_iter=300, n_jobs=4, random_state=301)
# Decision tree model
dt = DecisionTreeClassifier(criterion = 'gini',splitter = 'best', random_state = 0)
# Random forest model
rf = RandomForestClassifier(n_estimators = 150, criterion = 'gini', random_state = 0)
# Logistic regression model
lr = LogisticRegression(max_iter=10000, C=1, multi_class='auto', solver='newton-cg')
# Kernel svm model
kernel_svm = SVC(kernel = 'linear', random_state = 0, gamma='scale')
# K nearest neighbours model
knn = KNeighborsClassifier(n_neighbors = 1, metric = 'minkowski', p = 2)

In [62]:
classifier = VotingClassifier(estimators=[('knn',knn),('kernel_svm',kernel_svm)], voting='hard')

In [63]:
def train_predict_model(classifier,  train_features, train_labels,  test_features, test_labels):
    
    # build the model    
    classifier.fit(train_features, train_labels)
    # make predictions using model
    y_pred = classifier.predict(test_features) 
    
    return y_pred

In [None]:
y_pred = train_predict_model(classifier=classifier,
                             train_features=cv_train_features,
                             train_labels= y_train,
                             test_features=cv_test_features,
                             test_labels= y_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print out the results
print('Confusion Matrix:', '\n')
print(cm, '\n')
print('Accuracy = ', accuracy)
print('F1 Score = ',f1, '\n')
print(classification_report(y_test, y_pred))