In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#to data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#NLP tools
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

#train split and fit models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#model selection
from sklearn.metrics import confusion_matrix, accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!cp gdrive/MyDrive/prepared_DPM_preprocessing3.pkl .
import pandas as pd

In [4]:
dataset = pd.read_pickle("prepared_DPM_preprocessing3.pkl")
dataset.head(10)

Unnamed: 0,par_id,sentence,label
7051,7799,i m not proud of what happened over the weeke...,0
1490,1676,cbpr methods can increase our understanding o...,0
2216,9126,in big cities most of the country is rural an...,1
5373,1824,i am climbing all these summits to raise fund...,1
4108,763,economies that could have otherwise done well...,1
1327,4023,her compassionate outreach her topic is mercy...,1
8284,9157,should the skeleton be positively identified a...,0
6412,7094,the irish immigrant also faced a charge of att...,0
9005,9952,while there are an estimated 11 million people...,0
5963,6601,one reason senator sessions said is that immig...,0


In [5]:
dataset.shape

(17756, 3)

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17756 entries, 7051 to 6270
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   par_id    17756 non-null  object
 1   sentence  17756 non-null  object
 2   label     17756 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 554.9+ KB


In [7]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,17756.0,0.466321,0.498879,0.0,0.0,0.0,1.0,1.0


In [8]:
dt_trasformed = dataset[['label', 'sentence']]
y = dt_trasformed.iloc[:, :-1].values

In [9]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
y = np.array(ct.fit_transform(y))

In [10]:
y[:10]

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [11]:
print(y)

[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [12]:
y_df = pd.DataFrame(y)

In [13]:
np.array(y_df[0])

array([1., 1., 0., ..., 1., 1., 1.])

In [14]:
np.array(y_df[1])

array([0., 0., 1., ..., 0., 0., 0.])

In [15]:
dataset['sentence'].values

array([' i m not proud of what happened over the weekend and i wish to extend my apology to women all over south africa and the world there are better ways to deal with disputes and disagreements ',
       ' cbpr methods can increase our understanding of risk and protective factors in refugee and immigrant communities and contribute to development of culturally appropriate programming to address health disparities shirazi said via email my use of cbpr has taken an innovative approach to addressing health disparities and building evidencebased community health programming ',
       ' in big cities most of the country is rural and most of the population is illiterate and hopeless',
       ...,
       'presently what aspect of smart cities do you think is most vulnerable ',
       'like many in sao paulo s army of the homeless she sees her squat not as something illegal but as a real fight for housing ',
       'many of the devices are vulnerable based on siemens alerts and do not have th

In [16]:
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(dataset['sentence'].values).toarray()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y_df[1], test_size = 0.20, random_state = 0)

Finding the best models to predict 

Naive Bayes



In [18]:
classifier_np = GaussianNB()
classifier_np.fit(X_train, y_train)

GaussianNB()

Decision Tree

In [19]:
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

KNN

In [20]:
classifier_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

KNeighborsClassifier()

Logistic Regression



In [21]:
classifier_lr = LogisticRegression(random_state = 0)
classifier_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)

Random Forest



In [22]:
classifier_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

SVM Classifier



In [23]:
classifier_svm = svm.SVC()
classifier_svm.fit(X_train, y_train)

SVC()

XGBoost Classifier



In [24]:
classifier_xgb= XGBClassifier()
classifier_xgb.fit(X_train, y_train)

XGBClassifier()

Making the Confusion Matrix for each model


In [25]:
#Naive Bayes
y_pred_np = classifier_np.predict(X_test)
cm = confusion_matrix(y_test, y_pred_np)
print(cm)

[[1313  610]
 [ 147 1482]]


In [26]:
#KNN
y_pred_knn = classifier_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)

[[ 773 1150]
 [  89 1540]]


In [27]:
#XGBoost Classifier
y_pred_xgb = classifier_xgb.predict(X_test)
cm = confusion_matrix(y_test, y_pred_xgb)
print(cm)

[[1512  411]
 [ 368 1261]]


In [28]:
#SVM
y_pred_svm = classifier_svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred_svm)
print(cm)

[[1725  198]
 [ 119 1510]]


In [29]:
#Logistic Regression
y_pred_lr=classifier_lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)

[[1639  284]
 [ 163 1466]]


In [30]:
#Decision Tree
y_pred_dt = classifier_dt.predict(X_test)
cm = confusion_matrix(y_test, y_pred_dt)
print(cm)

[[1577  346]
 [ 158 1471]]


In [31]:
#Random Florest
y_pred_rf = classifier_rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)

[[1803  120]
 [ 137 1492]]


In [32]:
rf_score = accuracy_score(y_test, y_pred_rf)
knn_score = accuracy_score(y_test, y_pred_knn)
svm_score = accuracy_score(y_test, y_pred_svm)
xgb_score = accuracy_score(y_test, y_pred_xgb)
lr_score = accuracy_score(y_test, y_pred_lr)
dt_score = accuracy_score(y_test, y_pred_dt)
np_score = accuracy_score(y_test, y_pred_np)

print('Random Forest Accuracy: ', str(rf_score))
print('K Nearest Neighbours Accuracy: ', str(knn_score))
print('Support Vector Machine Accuracy: ', str(svm_score))
print('XGBoost Classifier Accuracy: ', str(xgb_score))
print('Logistic Regression Accuracy: ',str(lr_score))
print('Decision Tree Accuracy: ', str(dt_score))
print('Naive Bayes Accuracy: ', str(np_score))

Random Forest Accuracy:  0.9276463963963963
K Nearest Neighbours Accuracy:  0.6511824324324325
Support Vector Machine Accuracy:  0.9107545045045045
XGBoost Classifier Accuracy:  0.7806869369369369
Logistic Regression Accuracy:  0.8741554054054054
Decision Tree Accuracy:  0.8581081081081081
Naive Bayes Accuracy:  0.7868806306306306


In [33]:
from sklearn.metrics import classification_report, confusion_matrix
class_names = ['Not Patronizing','Is Patronizing']
print(classification_report(y_test, y_pred_rf, target_names=class_names))
print(confusion_matrix(y_test, y_pred_rf))

                 precision    recall  f1-score   support

Not Patronizing       0.93      0.94      0.93      1923
 Is Patronizing       0.93      0.92      0.92      1629

       accuracy                           0.93      3552
      macro avg       0.93      0.93      0.93      3552
   weighted avg       0.93      0.93      0.93      3552

[[1803  120]
 [ 137 1492]]


In [34]:
print(classification_report(y_test, y_pred_knn, target_names=class_names))
print(confusion_matrix(y_test, y_pred_knn))

                 precision    recall  f1-score   support

Not Patronizing       0.90      0.40      0.56      1923
 Is Patronizing       0.57      0.95      0.71      1629

       accuracy                           0.65      3552
      macro avg       0.73      0.67      0.63      3552
   weighted avg       0.75      0.65      0.63      3552

[[ 773 1150]
 [  89 1540]]


In [35]:
print(classification_report(y_test, y_pred_svm, target_names=class_names))
print(confusion_matrix(y_test, y_pred_svm))

                 precision    recall  f1-score   support

Not Patronizing       0.94      0.90      0.92      1923
 Is Patronizing       0.88      0.93      0.91      1629

       accuracy                           0.91      3552
      macro avg       0.91      0.91      0.91      3552
   weighted avg       0.91      0.91      0.91      3552

[[1725  198]
 [ 119 1510]]


In [36]:
print(classification_report(y_test, y_pred_xgb, target_names=class_names))
print(confusion_matrix(y_test, y_pred_xgb))

                 precision    recall  f1-score   support

Not Patronizing       0.80      0.79      0.80      1923
 Is Patronizing       0.75      0.77      0.76      1629

       accuracy                           0.78      3552
      macro avg       0.78      0.78      0.78      3552
   weighted avg       0.78      0.78      0.78      3552

[[1512  411]
 [ 368 1261]]


In [37]:
print(classification_report(y_test, y_pred_lr, target_names=class_names))
print(confusion_matrix(y_test, y_pred_lr))

                 precision    recall  f1-score   support

Not Patronizing       0.91      0.85      0.88      1923
 Is Patronizing       0.84      0.90      0.87      1629

       accuracy                           0.87      3552
      macro avg       0.87      0.88      0.87      3552
   weighted avg       0.88      0.87      0.87      3552

[[1639  284]
 [ 163 1466]]


In [38]:
print(classification_report(y_test, y_pred_dt, target_names=class_names))
print(confusion_matrix(y_test, y_pred_dt))

                 precision    recall  f1-score   support

Not Patronizing       0.91      0.82      0.86      1923
 Is Patronizing       0.81      0.90      0.85      1629

       accuracy                           0.86      3552
      macro avg       0.86      0.86      0.86      3552
   weighted avg       0.86      0.86      0.86      3552

[[1577  346]
 [ 158 1471]]


In [39]:
print(classification_report(y_test, y_pred_np, target_names=class_names))
print(confusion_matrix(y_test, y_pred_np))

                 precision    recall  f1-score   support

Not Patronizing       0.90      0.68      0.78      1923
 Is Patronizing       0.71      0.91      0.80      1629

       accuracy                           0.79      3552
      macro avg       0.80      0.80      0.79      3552
   weighted avg       0.81      0.79      0.79      3552

[[1313  610]
 [ 147 1482]]
