In [None]:
import itertools
import os
import re
import string as st

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import PorterStemmer, WordNetLemmatizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Location of the cleaned DSS export. The original absolute path stays the default so
# existing runs behave the same; set the DSS_DATA_PATH environment variable to load the
# file from another location without editing the notebook.
DATA_PATH = os.environ.get(
    "DSS_DATA_PATH",
    "/Users/omar.hassan/Documents/DAEN/DAEN 690/dss_cleanv2.csv",
)
data = pd.read_csv(DATA_PATH)

In [None]:
# Let pandas choose a dtype for every column via convert_dtypes(), then print the
# per-column dtype / non-null summary to inspect what was loaded.
data = data.convert_dtypes()
data.info()

In [None]:
# Drop the rows whose 'comments' value is missing; the cleaning steps further down
# apply string functions to every remaining comment.
data = data.dropna(subset = ['comments'])

In [None]:
# Treat a missing 'complaint' label as 'Yes'. The filled column is assigned back to the
# frame instead of calling fillna(..., inplace=True) on the selected Series: the in-place
# form acts on an intermediate object, triggers a chained-assignment warning in recent
# pandas releases, and leaves `data` unchanged once Copy-on-Write is active.
data['complaint'] = data['complaint'].fillna('Yes')

In [None]:
# Check how the labels are distributed
# Distinct label values present in the column.
print(np.unique(data['complaint']))
# Number of rows per label. The counts are printed together with their labels; running
# them through np.unique would drop the labels and merge equal counts.
print(data['complaint'].value_counts())

In [None]:
# Strip every punctuation character from the text

def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    All other characters are kept in their original order.
    """
    kept = []
    for char in text:
        if char in st.punctuation:
            continue
        kept.append(char)
    return "".join(kept)

In [None]:
# Punctuation-free copy of every comment, stored in its own column.
data['removed_punc'] = data['comments'].apply(remove_punct)
data.head()

In [None]:
def tokenize(text):
    """Convert text to a list of lower-case tokens.

    The text is split on runs of whitespace. A different separator pattern (special
    characters, tabs, or any other string) could be used instead if the text should be
    separated into tokens differently.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order. The empty strings that re.split
        produces for leading or trailing whitespace are left out.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence and
    # produces a warning on current Python versions.
    return [token.lower() for token in re.split(r'\s+', text) if token]

In [None]:
# Lower-case token list for every punctuation-free comment.
data['tokens'] = data['removed_punc'].apply(tokenize)
data.head()

In [None]:
# Remove short tokens (by default every token of 3 characters or fewer)
def remove_small_words(text, min_length=4):
    """Keep only the tokens that are at least ``min_length`` characters long.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    min_length : int, optional
        Smallest token length that is kept. The default of 4 drops every token of
        length 3 or less, which is what this function has always done; pass 3 to also
        keep three-letter tokens.

    Returns
    -------
    list of str
        The retained tokens in their original order.
    """
    return [token for token in text if len(token) >= min_length]

In [None]:
# Token lists with the short tokens filtered out.
data['filtered_tokens'] = data['tokens'].apply(remove_small_words)
data.head()

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# NLTK data used further down: the stopword list read by remove_stopwords and the
# WordNet corpus used by WordNetLemmatizer. Without the WordNet download, lemmatizing
# raises a LookupError on a machine that has never fetched that corpus.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Stopword sets already read from the NLTK corpus, keyed by language name.
_STOPWORD_SETS = {}


def _nltk_stopwords(language='english'):
    """Return the NLTK stopword set for ``language``, reading the corpus only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def remove_stopwords(text, stop_words=None):
    """Remove stopwords from a list of tokens.

    By default the NLTK English stopword list is used for the match. A customized,
    user-defined collection can be passed instead to control which tokens are dropped.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the NLTK English stopword list is used.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
    """
    if stop_words is None:
        # Looked up once per call (and read from disk once per session) rather than
        # calling stopwords.words('english') again for every single token.
        stop_words = _nltk_stopwords('english')
    elif not isinstance(stop_words, (set, frozenset)):
        # Set membership keeps the per-token check constant-time.
        stop_words = set(stop_words)
    return [word for word in text if word not in stop_words]

In [None]:
# Token lists with the stopwords taken out.
data['clean_tokens'] = data['filtered_tokens'].apply(remove_stopwords)
data.head()

In [None]:
# Lemmatize a list of tokens
def lemmatize(text):
    """Return the WordNetLemmatizer lemma of every token in ``text``, in the same order."""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Lemmatized version of the cleaned token lists.
data['lemma_words'] = data['clean_tokens'].apply(lemmatize)
data.head()

In [None]:
# Turn a token list back into a single string to use as input for the vectorizer

def return_sentences(tokens):
    """Join the tokens into one string, separated by single spaces."""
    return " ".join(tokens)

In [None]:
# Cleaned comment text rebuilt from the lemmatized tokens.
data['clean_text'] = data['lemma_words'].apply(return_sentences)
data.head()

In [None]:
# Basic word cloud over all cleaned comments
from wordcloud import WordCloud, ImageColorGenerator

# One long string holding every cleaned comment, separated by spaces.
text = " ".join(data['clean_text'])

# Configure the cloud, then fill it from the combined text
# (generate() is called on the configured object, which `wordcloud` keeps referring to).
wordcloud = WordCloud(max_font_size=30, max_words=1000)
wordcloud.generate(text)

# Render the cloud without axes.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Prepare data for the model: convert the label into binary (1 = 'Yes', 0 = anything else).
# Values that are already 1 are kept as 1, so re-running this cell does not turn every
# label into 0 the way a plain comparison against 'Yes' would on the second run.
data['complaint'] = data['complaint'].isin(['Yes', 1]).astype(int)
data.head()


In [None]:
# Split the dataset: 20% of the rows are held out for testing, and random_state is
# fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'],
    data['complaint'],
    test_size=0.2,
    random_state=5,
)

# Shape of the training texts, then of the test texts.
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training text only, then reuse the fitted vectorizer to
# transform the test text.
tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)

# Dense view followed by the shape of each matrix: training first, then test.
print(tfidf_train.toarray(), tfidf_train.shape, sep="\n")
print(tfidf_test.toarray(), tfidf_test.shape, sep="\n")

In [None]:
# Logistic Regression model
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(max_iter = 500).fit(tfidf_train, y_train)
print('Logistic Regression model fitted..')

# Predict the test labels and report accuracy and the confusion matrix.
ypred = lr.predict(tfidf_test)
print("Accuracy score : {}".format(accuracy_score(y_test, ypred)))
print("Confusion matrix : \n {}".format(confusion_matrix(y_test, ypred)))

In [None]:
# Evaluate the predictions currently held in `ypred` against the true test labels.
lr_accuracy = accuracy_score(y_test,ypred)
print('Accuracy:',lr_accuracy)

# Confusion matrix from the actual and predicted values
lr_cm = confusion_matrix(y_test, ypred)

# annot=True writes the count into each cell of the heatmap
plt.figure(figsize = (15,8))
sns.heatmap(lr_cm, annot=True, fmt='.0f')
# Use a file name specific to this model so the figure is not overwritten when another
# model's confusion matrix is saved.
plt.savefig('confusion_lr.png')

lr_cr = classification_report(y_test, ypred)
print('Classification Report:')
print (lr_cr)

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with its default settings, trained on the TF-IDF
# features (fit() returns the estimator, so construction and training are chained).
knn_clf = KNeighborsClassifier().fit(tfidf_train, y_train)

# Predicted labels for the test set.
ypred = knn_clf.predict(tfidf_test)

In [None]:
# Evaluate the predictions currently held in `ypred` against the true test labels.
knn_accuracy = accuracy_score(y_test,ypred)
print('Accuracy:',knn_accuracy)

# Confusion matrix from the actual and predicted values
knn_cm = confusion_matrix(y_test, ypred)

# annot=True writes the count into each cell of the heatmap
plt.figure(figsize = (15,8))
sns.heatmap(knn_cm, annot=True, fmt='.0f')
# Use a file name specific to this model so the figure is not overwritten when another
# model's confusion matrix is saved.
plt.savefig('confusion_knn.png')

knn_cr = classification_report(y_test, ypred)
print('Classification Report:')
print (knn_cr)

In [None]:
# Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Create the classifier and train it on the training set. The sparse TF-IDF matrix is
# converted to a dense array before it is passed in.
classifer = GaussianNB().fit(tfidf_train.toarray(), y_train)

# Predicted labels for the test set, again from the dense form of the matrix.
y_pred = classifer.predict(tfidf_test.toarray())

In [None]:
# Evaluate the Naive Bayes predictions in `y_pred` against the true test labels.
nb_accuracy = accuracy_score(y_test,y_pred)
print('Accuracy:',nb_accuracy)

# Confusion matrix from the actual and predicted values
nb_cm = confusion_matrix(y_test, y_pred)

# annot=True writes the count into each cell of the heatmap
plt.figure(figsize = (15,8))
sns.heatmap(nb_cm, annot=True, fmt='.0f')
# Use a file name specific to this model so the figure is not overwritten when another
# model's confusion matrix is saved.
plt.savefig('confusion_nb.png')

nb_cr = classification_report(y_test, y_pred)
print('Classification Report:')
print (nb_cr)