In [1]:
import pandas as pd
import numpy as np
import PIL

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report 

In [2]:
df = pd.read_csv('labelled_tweets_clean.csv')
df

Unnamed: 0.1,Unnamed: 0,Tweet,VADERsentiment
0,0,must blame mr ekandjo ! everything belong swapo .,Negative
1,1,call national flag say belong swapo ? contradi...,Negative
2,2,immanuel benefits swapo wants stay power till ...,Positive
3,3,swanu flag inspired swapo flag ?,Positive
4,4,swapo wants rule enrich pockets like pm. kak b...,Positive
...,...,...,...
143,143,"blame public relations team swapo , tell good ...",Positive
144,144,"mekondjo cases , complaints even taken serious...",Negative
145,145,"swapo ' oshana regional coordinator , samuel n...",Positive
146,146,"cde , ask pres question prepare swapo elective...",Positive


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      148 non-null    int64 
 1   Tweet           148 non-null    object
 2   VADERsentiment  148 non-null    object
dtypes: int64(1), object(2)
memory usage: 3.6+ KB


In [4]:
df = df.dropna()

In [5]:
df['VADERsentiment'].value_counts()

Positive    79
Negative    44
Neutral     25
Name: VADERsentiment, dtype: int64

In [6]:
df['VADERsentiment'] = np.where(df['VADERsentiment'] == 'Positive', 1, 0)

In [7]:
binary_df = df

In [8]:
binary_df

Unnamed: 0.1,Unnamed: 0,Tweet,VADERsentiment
0,0,must blame mr ekandjo ! everything belong swapo .,0
1,1,call national flag say belong swapo ? contradi...,0
2,2,immanuel benefits swapo wants stay power till ...,1
3,3,swanu flag inspired swapo flag ?,1
4,4,swapo wants rule enrich pockets like pm. kak b...,1
...,...,...,...
143,143,"blame public relations team swapo , tell good ...",1
144,144,"mekondjo cases , complaints even taken serious...",0
145,145,"swapo ' oshana regional coordinator , samuel n...",1
146,146,"cde , ask pres question prepare swapo elective...",1


In [9]:
binary_df['VADERsentiment'].value_counts()

1    79
0    69
Name: VADERsentiment, dtype: int64

In [10]:
pos_samples = binary_df[binary_df['VADERsentiment']==1]
neg_samples = binary_df[binary_df['VADERsentiment']==0]

In [11]:
balanced_binary_df = pd.concat([pos_samples, neg_samples])

In [12]:
balanced_binary_df

Unnamed: 0.1,Unnamed: 0,Tweet,VADERsentiment
2,2,immanuel benefits swapo wants stay power till ...,1
3,3,swanu flag inspired swapo flag ?,1
4,4,swapo wants rule enrich pockets like pm. kak b...,1
12,12,swapo elite becoming desperate understand days...,1
13,13,"swapo , put people swapo power ? national flag...",1
...,...,...,...
135,135,swapo grandfather,0
136,136,china russia cold war time see eye eye. china ...,0
140,140,swapo founded day,0
144,144,"mekondjo cases , complaints even taken serious...",0


In [13]:
balanced_binary_df['VADERsentiment'].value_counts()

1    79
0    69
Name: VADERsentiment, dtype: int64

In [14]:
tweets = balanced_binary_df['Tweet']
sentiments = balanced_binary_df['VADERsentiment']

In [15]:
X_train, X_test, y_train, y_test  = train_test_split(tweets, sentiments, test_size=0.2, random_state=42)

In [16]:
cv = CountVectorizer(analyzer='word', ngram_range=(1,2)) 

In [17]:
cv_train_features = cv.fit_transform(X_train) 

In [18]:
cv_test_features = cv.transform(X_test)

In [19]:
# Support vector machine model
svm = SGDClassifier(loss='hinge', l1_ratio=0.15, max_iter=300, n_jobs=4, random_state=101)
# Decision tree model
dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
# Random forest model
rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# Logistic regression model
lr = LogisticRegression(max_iter=100, C=1, multi_class='auto', solver='lbfgs')
# Kernel svm model
kernel_svm = SVC(kernel = 'rbf', random_state = 0, gamma='scale')
# K nearest neighbours model
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)

In [20]:
## Ensemble Technique

In [21]:
classifier = VotingClassifier(estimators=[('svm', svm), ('dt', dt),('rf', rf), ('lr', lr), ('kernel_svm', kernel_svm), ('knn', knn)], voting='hard')

NameError: name 'VotingClassifier' is not defined

In [None]:
def train_predict_model(classifier,  train_features, train_labels,  test_features, test_labels):
    
    # build the model    
    classifier.fit(train_features, train_labels)
    # make predictions using model
    y_pred = classifier.predict(test_features) 
    
    return y_pred

In [None]:
y_pred = train_predict_model(classifier=classifier,
                             train_features=cv_train_features,
                             train_labels= y_train,
                             test_features=cv_test_features,
                             test_labels= y_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print out the results
print('Confusion Matrix:', '\n')
print(cm, '\n')
print('Accuracy = ', accuracy)
print('F1 Score = ',f1, '\n')
print(classification_report(y_test, y_pred))

In [None]:
new_tweets = df['Tweet'].iloc[10:30]

In [None]:
newtext_features = cv.transform(new_tweets)

In [None]:
new_result = classifier.predict(newtext_features)
print(new_result)

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from simpletransformers.classification import ClassificationModel

import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import random
import numpy as np
import torch
from sklearn.model_selection import KFold

import logging
from pathlib import Path

In [None]:
balanced_binary_df['VADERsentiment'].value_counts()

In [None]:
df['VADERsentiment'].value_counts()

In [None]:
train_df, val_df = train_test_split(balanced_binary_df, test_size=0.2,  random_state=42)

In [None]:
train_df['VADERsentiment'].value_counts()
val_df['VADERsentiment'].value_counts()