In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
orig_df = pd.read_csv('labeled_data_100k.csv')
orig_df.head()

Unnamed: 0,from,to,subject,body,is_useful
0,phillip.allen@enron.com,tim.belden@enron.com,,Here is our forecast,0
1,phillip.allen@enron.com,john.lavorato@enron.com,Re,Traveling to have a business meeting takes the...,0
2,phillip.allen@enron.com,leah.arsdall@enron.com,Re,test successful. way to go!!!,0
3,phillip.allen@enron.com,randall.gay@enron.com,,"Randy,Can you send me a schedule of the salary...",1
4,phillip.allen@enron.com,greg.piper@enron.com,Re,,0


In [3]:
df = orig_df.rename(columns={"body": "Body", "is_useful": "Useful"}, errors="raise")
df = df[['Body', 'Useful']]
df['category_id'] = df['Useful']
df['Body'] = df['Body'].fillna("no body")
df.head()

Unnamed: 0,Body,Useful,category_id
0,Here is our forecast,0,0
1,Traveling to have a business meeting takes the...,0,0
2,test successful. way to go!!!,0,0
3,"Randy,Can you send me a schedule of the salary...",1,1
4,no body,0,0


In [4]:
from io import StringIO

def set_status(key):
    if key == 1:
        return "Useful"
    elif key == 0:
        return "Not-Useful"

col = ['Useful', 'Body']
df = df[col]
df = df[pd.notnull(df['Body'])]
df.columns = ['Useful', 'Body']
df['category_id'] = df['Useful']
df['Useful'] = df['category_id'].apply(set_status)
category_id_df = df[['Useful', 'category_id']].sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Useful']].values)
df = df.head(3000)

<h2>Using Linear Regression SVC</h2>

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [6]:
labels = df['Useful']
text = df['Body']

X_train, X_test, y_train, y_test = train_test_split(text, labels, random_state=0, test_size=0.3)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf_transformer.transform(X_train_counts)

X_test_counts = count_vect.transform(X_test)
X_test_transformed = tf_transformer.transform(X_test_counts)

labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_lables_trf = labels.transform(y_train)

print(labels.classes_)

['Not-Useful' 'Useful']


In [7]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

linear_svc = LinearSVC()
clf = linear_svc.fit(X_train_transformed,y_train_lables_trf)

calibrated_svc = CalibratedClassifierCV(base_estimator=linear_svc,
                                        cv="prefit")

calibrated_svc.fit(X_train_transformed,y_train_lables_trf)
predicted = calibrated_svc.predict(X_test_transformed)    # predict on X_test
    
print('Average accuracy on test set={}'.format(np.mean(predicted == labels.transform(y_test))))

Average accuracy on test set=0.9744444444444444


<h3>Testing Manually Labeled Data | Count : 90</h3>

In [8]:
test_90 = pd.read_csv('test_90.csv')
test_90.head()

Unnamed: 0,Useful,Body,email_subject
0,1.0,"""meeting tmr. join webex""","""cisco hackathon meeting"""
1,1.0,"""You are part of the Group 1 student activatio...","""Important Notification"""
2,0.0,"""The wait is over—your Robinhood tax document ...","""Ready to download"""
3,0.0,"""Hi, We’ve recently launched a new Google Pay ...",
4,0.0,"""I am sending the documents now.""","""Re: Benefits"""


In [9]:
# a method to remove the double quotes from start and end
def remove_quotes(text):
    if text is np.nan: 
        return "no subject"
    if text != None and type(text) != float:
        return text[1:len(text)-1]
    return text

In [10]:
# clean up the data
test_90['Body'] = test_90['Body'].apply(remove_quotes)
test_90['email_subject'] = test_90['email_subject'].apply(remove_quotes)
test_90['Useful'] = test_90['Useful'].fillna(0.0).astype(int)
test_90.head()

Unnamed: 0,Useful,Body,email_subject
0,1,"meeting tmr. join webex""",cisco hackathon meeting
1,1,You are part of the Group 1 student activation...,Important Notification
2,0,The wait is over—your Robinhood tax document i...,Ready to download
3,0,"Hi, We’ve recently launched a new Google Pay a...",no subject
4,0,I am sending the documents now.,Re: Benefits


In [11]:
body_texts = list(test_90['Body'])

In [12]:
y_true = list(test_90['Useful'])
y_pred = []
count = 0

for idx, body_text in zip(range(len(body_texts)), body_texts):
    p_count = count_vect.transform([body_text])
    p_tfidf = tf_transformer.transform(p_count)
    y_predict = calibrated_svc.predict(p_tfidf)
    y_pred.extend(y_predict)
    if y_predict[0] == y_true[idx]:
        count = count + 1
acc = (count / len(y_true)) * 100, "% Accuracy"
print("Correctly predicted: ", count, " out of ", len(y_true))
print("Accuracy: ", acc)


Correctly predicted:  74  out of  91
Accuracy:  (81.31868131868131, '% Accuracy')
