In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#to data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#NLP tools
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

#train split and fit models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#model selection
from sklearn.metrics import confusion_matrix, accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!cp gdrive/MyDrive/Imbalance_prepare/imbalance_prepared_DPM_preprocessing.pkl .
import pandas as pd

In [4]:
dataset = pd.read_pickle("imbalance_prepared_DPM_preprocessing.pkl")
dataset.head(10)

Unnamed: 0,par_id,sentence,label
6443,7132,society is vulnerable to extreme weather the u...,0
2503,2781,in italy a populist antiestablishment five sta...,0
1203,10142,the absurdity of wealth discrepancy and socia...,1
5265,5810,downtown pittsburgh was crowded with slovaks ...,0
1364,763,of the result now is the massive exodus of af...,1
2666,2882,form of relief to those desperately in need o...,1
2582,2870,rome s housing crunch predated raggi s tenure ...,0
988,1542,was born hivpositive he s healthy but when sa...,1
337,374,in the uk racist comments or acts are not open...,0
8854,9786,stf even fired tear gas into the wards with di...,0


In [5]:
dataset.shape

(12236, 3)

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12236 entries, 6443 to 1337
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   par_id    12236 non-null  object
 1   sentence  12236 non-null  object
 2   label     12236 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 382.4+ KB


In [7]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,12236.0,0.225564,0.41797,0.0,0.0,0.0,0.0,1.0


In [8]:
dt_trasformed = dataset[['label', 'sentence']]
y = dt_trasformed.iloc[:, :-1].values

In [9]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
y = np.array(ct.fit_transform(y))

In [10]:
y[:10]

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [11]:
print(y)

[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


In [12]:
y_df = pd.DataFrame(y)

In [13]:
np.array(y_df[0])

array([1., 1., 0., ..., 1., 0., 1.])

In [14]:
np.array(y_df[1])

array([0., 0., 1., ..., 0., 1., 0.])

In [15]:
dataset['sentence'].values

array(['society is vulnerable to extreme weather the united nations body whose job it is to assess the science on climate change says the north atlantic and western north pacific will see more strong storms like typhoon haiyan that tore through the philippines in 2013 in europe heat waves like the 2003 event which killed 70000 people are already 10 times more likely than a decade ago and this pattern is set to continue scientists also know that warmer air will mean rainfall in heavier bursts while higher seas will make storms more likely to breach coastal flood defenses ',
       'in italy a populist antiestablishment five star movement m5s that contested the march 2018 parliamentary elections under the slogan participate choose change and a regionalist populist league that contested under the slogan italians first formed an alliance this month to form government the election environment also focused on the apparently irreversible decline of economy persistent high unemployment and cor

In [16]:
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(dataset['sentence'].values).toarray()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y_df[1], test_size = 0.30, random_state = 0)

Finding the best models to predict 

Naive Bayes



In [18]:
classifier_np = GaussianNB()
classifier_np.fit(X_train, y_train)

GaussianNB()

Decision Tree

In [19]:
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

KNN

In [20]:
classifier_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

KNeighborsClassifier()

Logistic Regression



In [21]:
classifier_lr = LogisticRegression(random_state = 0)
classifier_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)

Random Forest



In [22]:
classifier_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

SVM Classifier



In [23]:
classifier_svm = svm.SVC()
classifier_svm.fit(X_train, y_train)

SVC()

XGBoost Classifier



In [24]:
classifier_xgb= XGBClassifier()
classifier_xgb.fit(X_train, y_train)

XGBClassifier()

Making the Confusion Matrix for each model


In [25]:
#Naive Bayes
y_pred_np = classifier_np.predict(X_test)
cm = confusion_matrix(y_test, y_pred_np)
print(cm)

[[1515 1304]
 [  91  761]]


In [26]:
#KNN
y_pred_knn = classifier_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)

[[2567  252]
 [ 504  348]]


In [27]:
#XGBoost Classifier
y_pred_xgb = classifier_xgb.predict(X_test)
cm = confusion_matrix(y_test, y_pred_xgb)
print(cm)

[[2786   33]
 [ 665  187]]


In [28]:
#SVM
y_pred_svm = classifier_svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred_svm)
print(cm)

[[2787   32]
 [ 401  451]]


In [29]:
#Logistic Regression
y_pred_lr=classifier_lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)

[[2602  217]
 [ 302  550]]


In [30]:
#Decision Tree
y_pred_dt = classifier_dt.predict(X_test)
cm = confusion_matrix(y_test, y_pred_dt)
print(cm)

[[2493  326]
 [ 219  633]]


In [31]:
#Random Florest
y_pred_rf = classifier_rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)

[[2785   34]
 [ 332  520]]


In [32]:
rf_score = accuracy_score(y_test, y_pred_rf)
knn_score = accuracy_score(y_test, y_pred_knn)
svm_score = accuracy_score(y_test, y_pred_svm)
xgb_score = accuracy_score(y_test, y_pred_xgb)
lr_score = accuracy_score(y_test, y_pred_lr)
dt_score = accuracy_score(y_test, y_pred_dt)
np_score = accuracy_score(y_test, y_pred_np)

print('Random Forest Accuracy: ', str(rf_score))
print('K Nearest Neighbours Accuracy: ', str(knn_score))
print('Support Vector Machine Accuracy: ', str(svm_score))
print('XGBoost Classifier Accuracy: ', str(xgb_score))
print('Logistic Regression Accuracy: ',str(lr_score))
print('Decision Tree Accuracy: ', str(dt_score))
print('Naive Bayes Accuracy: ', str(np_score))

Random Forest Accuracy:  0.9002996458730591
K Nearest Neighbours Accuracy:  0.7940615636066467
Support Vector Machine Accuracy:  0.8820484881503677
XGBoost Classifier Accuracy:  0.8098610732770363
Logistic Regression Accuracy:  0.8586216289839281
Decision Tree Accuracy:  0.8515390901661672
Naive Bayes Accuracy:  0.6199945518932171


In [33]:
from sklearn.metrics import classification_report, confusion_matrix
class_names = ['Not Patronizing','Is Patronizing']
print(classification_report(y_test, y_pred_rf, target_names=class_names))
print(confusion_matrix(y_test, y_pred_rf))

                 precision    recall  f1-score   support

Not Patronizing       0.89      0.99      0.94      2819
 Is Patronizing       0.94      0.61      0.74       852

       accuracy                           0.90      3671
      macro avg       0.92      0.80      0.84      3671
   weighted avg       0.90      0.90      0.89      3671

[[2785   34]
 [ 332  520]]


In [34]:
print(classification_report(y_test, y_pred_knn, target_names=class_names))
print(confusion_matrix(y_test, y_pred_knn))

                 precision    recall  f1-score   support

Not Patronizing       0.84      0.91      0.87      2819
 Is Patronizing       0.58      0.41      0.48       852

       accuracy                           0.79      3671
      macro avg       0.71      0.66      0.68      3671
   weighted avg       0.78      0.79      0.78      3671

[[2567  252]
 [ 504  348]]


In [35]:
print(classification_report(y_test, y_pred_svm, target_names=class_names))
print(confusion_matrix(y_test, y_pred_svm))

                 precision    recall  f1-score   support

Not Patronizing       0.87      0.99      0.93      2819
 Is Patronizing       0.93      0.53      0.68       852

       accuracy                           0.88      3671
      macro avg       0.90      0.76      0.80      3671
   weighted avg       0.89      0.88      0.87      3671

[[2787   32]
 [ 401  451]]


In [36]:
print(classification_report(y_test, y_pred_xgb, target_names=class_names))
print(confusion_matrix(y_test, y_pred_xgb))

                 precision    recall  f1-score   support

Not Patronizing       0.81      0.99      0.89      2819
 Is Patronizing       0.85      0.22      0.35       852

       accuracy                           0.81      3671
      macro avg       0.83      0.60      0.62      3671
   weighted avg       0.82      0.81      0.76      3671

[[2786   33]
 [ 665  187]]


In [37]:
print(classification_report(y_test, y_pred_lr, target_names=class_names))
print(confusion_matrix(y_test, y_pred_lr))

                 precision    recall  f1-score   support

Not Patronizing       0.90      0.92      0.91      2819
 Is Patronizing       0.72      0.65      0.68       852

       accuracy                           0.86      3671
      macro avg       0.81      0.78      0.79      3671
   weighted avg       0.85      0.86      0.86      3671

[[2602  217]
 [ 302  550]]


In [38]:
print(classification_report(y_test, y_pred_dt, target_names=class_names))
print(confusion_matrix(y_test, y_pred_dt))

                 precision    recall  f1-score   support

Not Patronizing       0.92      0.88      0.90      2819
 Is Patronizing       0.66      0.74      0.70       852

       accuracy                           0.85      3671
      macro avg       0.79      0.81      0.80      3671
   weighted avg       0.86      0.85      0.85      3671

[[2493  326]
 [ 219  633]]


In [40]:
print(classification_report(y_test, y_pred_np, target_names=class_names))
print(confusion_matrix(y_test, y_pred_np))

                 precision    recall  f1-score   support

Not Patronizing       0.94      0.54      0.68      2819
 Is Patronizing       0.37      0.89      0.52       852

       accuracy                           0.62      3671
      macro avg       0.66      0.72      0.60      3671
   weighted avg       0.81      0.62      0.65      3671

[[1515 1304]
 [  91  761]]


In [41]:
import pickle
 
# Save the trained model as a pickle string.
modelGaussianNB = pickle.dumps(classifier_np)
import joblib
# Save the model as a pickle in a file
joblib.dump(classifier_np, 'classifier_np.pkl')

['classifier_np.pkl']

In [42]:
# Save the trained model as a pickle string.
pickle.dumps(classifier_xgb)
# Save the model as a pickle in a file
joblib.dump(classifier_xgb, 'classifier_xgb.pkl')

['classifier_xgb.pkl']

In [43]:
# Save the trained model as a pickle string.
pickle.dumps(classifier_svm)
# Save the model as a pickle in a file
joblib.dump(classifier_svm, 'classifier_svm.pkl')

['classifier_svm.pkl']

In [44]:
# Save the trained model as a pickle string.
pickle.dumps(classifier_rf)
# Save the model as a pickle in a file
joblib.dump(classifier_lr, 'classifier_rf.pkl')

['classifier_rf.pkl']

In [45]:
# Save the trained model as a pickle string.
pickle.dumps(classifier_lr)
# Save the model as a pickle in a file
joblib.dump(classifier_lr, 'classifier_lr.pkl')

['classifier_lr.pkl']

In [46]:
# Save the trained model as a pickle string.
pickle.dumps(classifier_knn)
# Save the model as a pickle in a file
joblib.dump(classifier_knn, 'classifier_knn.pkl')

['classifier_knn.pkl']

In [47]:
# Save the trained model as a pickle string.
pickle.dumps(classifier_dt)
# Save the model as a pickle in a file
joblib.dump(classifier_dt, 'classifier_dt.pkl')

['classifier_dt.pkl']

In [48]:
!cp /content/*.pkl -r gdrive/MyDrive/Imbalance_prepare/model/70/