In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
def read_data(path, value):
    sentences = []
    targets = []

    with open(path, 'r') as f:
        for line in f:
            sentences.append(line.strip())
            targets.append(value)
    df = pd.DataFrame({'sentences': sentences, 'targets': targets})
    return df

In [52]:
clean_df = read_data("clean.txt", 0)
clean_df.head()

Unnamed: 0,sentences,targets
0,"Hey, do you have any tips for studying for the...",0
1,I'm really struggling to understand this conce...,0
2,I'm having trouble with this homework assignme...,0
3,I'm trying to form a study group for this clas...,0
4,I'm thinking about signing up for a tutor to h...,0


In [53]:
cheat_df = read_data("cheat.txt", 1)
cheat_df.head()

Unnamed: 0,sentences,targets
0,Can you give me the answers to this homework a...,1
1,"I'm struggling with this test, can you help me...",1
2,Do you know where I can find the answers to th...,1
3,"I'm really stuck on this homework, can you jus...",1
4,Is there any way you can send me the answers t...,1


In [54]:
df = pd.concat([cheat_df, clean_df], ignore_index=True)

In [55]:
df[:5]

Unnamed: 0,sentences,targets
0,Can you give me the answers to this homework a...,1
1,"I'm struggling with this test, can you help me...",1
2,Do you know where I can find the answers to th...,1
3,"I'm really stuck on this homework, can you jus...",1
4,Is there any way you can send me the answers t...,1


In [56]:
df[-5:]

Unnamed: 0,sentences,targets
235,Have you started studying for midterms yet?,0
236,I'm thinking about getting involved in communi...,0
237,Have you started thinking about your post-grad...,0
238,I'm thinking about joining a study abroad prog...,0
239,How do you think about your post-graduation pl...,0


In [57]:
from sklearn.utils import shuffle
df = shuffle(df)

In [58]:
df.head()

Unnamed: 0,sentences,targets
227,Have you decided on a major yet?,0
37,"I don't understand this material, can you just...",1
107,"I don't understand this concept, can you give ...",1
113,I'm really behind in this course and I don't h...,1
99,"I'm really struggling with this homework, can ...",1


In [59]:
df['targets'].value_counts()

0    120
1    120
Name: targets, dtype: int64

In [60]:
df.isnull().sum()

sentences    0
targets      0
dtype: int64

In [61]:
def clean_text(text):
    text = text.lower()
    text = text.strip()
    text=re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\[[0-9]*\]',' ',text)
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text)
    text = re.sub(r'\s+',' ',text)

    return text

In [62]:
test_text = "sdt[9] , hahaha...... &yiu"
print(clean_text(test_text))

sdt hahaha yiu


In [63]:
def stop_words(text):
    return ' '.join(word for word in text.split() if word.lower() not in stopwords.words('english'))

In [64]:
test_text = "the peeyush a ball boy he is very handsome and good hahahahahhahahahahahhaha"
print(stop_words(test_text))

peeyush ball boy handsome good hahahahahhahahahahahhaha


In [65]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

stemmer = SnowballStemmer('english')
def snow_stemmer(text):
  return ' '.join(stemmer.stem(word) for word in word_tokenize(text))

In [66]:
test_text = "Stemming example with SnowballStemmer"
print(snow_stemmer(test_text))

stem exampl with snowballstemm


In [67]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def wordnet_lemmatizer(text):
  return ' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(text))

In [68]:
test_text = "lemmatizing example with WordNetLemmatizer"
print(wordnet_lemmatizer(test_text))

lemmatizing example with WordNetLemmatizer


In [69]:
def preprocess(text):
  return wordnet_lemmatizer(snow_stemmer(stop_words(clean_text(text))))

In [70]:
test_text = "  test sd e%: , ?, ''  a     .  testing preprocessing with preprocess"
print(preprocess(test_text))

test sd e test preprocess preprocess


In [71]:
df['clean_sentences']=df['sentences'].apply(lambda sentence: preprocess(sentence))

In [72]:
df.head()

Unnamed: 0,sentences,targets,clean_sentences
227,Have you decided on a major yet?,0,decid major yet
37,"I don't understand this material, can you just...",1,understand materi give answer move
107,"I don't understand this concept, can you give ...",1,understand concept give answer move next topic
113,I'm really behind in this course and I don't h...,1,realli behind cours time work give answer catc...
99,"I'm really struggling with this homework, can ...",1,realli struggl homework give answer finish move


In [99]:
X_train, X_val, y_train, y_val = train_test_split(df["clean_sentences"],df["targets"],test_size=0.2, shuffle=True)

In [100]:
len(X_train), len(X_val), len(y_train), len(y_val)

(192, 48, 192, 48)

In [101]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors = tfidf_vectorizer.fit_transform(X_train)
X_val_vectors = tfidf_vectorizer.transform(X_val)

In [102]:
model_lr = LogisticRegression()
model_lr.fit(X_train_vectors, y_train)

In [103]:
y_predict = model_lr.predict(X_val_vectors)
y_prob = model_lr.predict_proba(X_val_vectors)[:,1]

In [104]:
print(classification_report(y_val, y_predict))
print('Confusion Matrix:', confusion_matrix(y_val, y_predict))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        25
           1       1.00      0.91      0.95        23

    accuracy                           0.96        48
   macro avg       0.96      0.96      0.96        48
weighted avg       0.96      0.96      0.96        48

Confusion Matrix: [[25  0]
 [ 2 21]]


In [105]:
fpr, tpr, thresholds = roc_curve(y_val, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

AUC: 0.9982608695652173


In [106]:
# since perfect performance
model = model_lr

In [107]:
def predict_sentence(sentence):
    sentence = preprocess(sentence)
    X_vector = tfidf_vectorizer.transform([sentence])  # convert the input sentence to a vector
    y_predict = model.predict(X_vector)  # use the trained model to make a prediction
    y_prob = model.predict_proba(X_vector)[:, 1]  # get the probability of the prediction
    return y_predict, y_prob

In [115]:
predict_sentence('how was your test?')

(array([0]), array([0.42861351]))