In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from time import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#Not using stemming as the performance improvement wasn't observed.
#from nltk.stem.porter import *

In [2]:
emails = pd.read_csv('preprocessed1.csv')
em = emails.dropna(axis=0)
em.sample(3)

Unnamed: 0,Subject,Category
13263,damn weather hey oh sooo upset rain thunder sp...,bill williams iii
8381,sun devil fuel ben ron following meeting today...,tw-commercial group
10514,gas daily natural gas intelligence articles el...,california


In [3]:
em['Category'].value_counts()

logistics              1170
tw-commercial group    1150
bill williams iii      1004
california              982
deal discrepancies      878
management              799
calendar                700
esvl                    663
tufco                   604
resumes                 599
e-mail bin              592
ces                     572
online trading          567
junk                    544
junk file               494
ooc                     473
genco-jv_ipo            465
projects                459
corporate               420
archives                419
Name: Category, dtype: int64

In [4]:
def pre_process_text(textArray):
    #If using stemming...
    #stemmer = PorterStemmer()
    wnl = WordNetLemmatizer()
    processed_text = []
    for text in textArray:
        words_list = (str(text).lower()).split()
        final_words = [wnl.lemmatize(word) for word in words_list if word not in stopwords.words('english')]
        #If using stemming...
        #final_words = [stemmer.stem(word) for word in words_list if word not in stopwords.words('english')]
        final_words_str = str((" ".join(final_words)))
        processed_text.append(final_words_str)
    return processed_text

em['Subject'] = pre_process_text(em['Subject'])

KeyboardInterrupt: 

In [5]:
categories = [ 'logistics','tw-commercial group','california','bill williams iii','deal discrepancies','management','calender','esvl','tufco','resumes','e-mail bin','ces','online trading','junk','junk file','ooc','genco','projects','corporate','archives']

In [24]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]);

In [25]:
# Every additional parameter value here will increase the training time by orders of magnitude. 
# I'm running on a relatively slow computer, hence reduced the values

parameters = {
    'vect__max_df': (0.5, 1.0),#0.6, 0.7, 0.8, 0.9, 1.0),
    'vect__max_features': (None, 1000, 5000),#2000, 3000, 4000, 5000, 6000, 10000, 20000, 30000, 40000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),#, (1, 3)),  # unigrams or bigrams or trigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.1, 0.01, 0.001),#, 0.0001, 0.00001, 0.000001, 0.0000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50),#, 100, 200, 300, 400, 500, 100),
}

In [26]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, refit=True)

print("Grid Search started\n---------------------------------------")
print("Pipeline:", [name for name, _ in pipeline.steps])
print("Grid Search Parameters:")
print(parameters)
t0 = time()
grid_search.fit(np.array(em['Subject']), np.array(em['Category']))
print("done in %0.3fs\n----------------------------------------------" % (time() - t0))

print("Best Score: %0.3f\n-------------------------------------------" % grid_search.best_score_)
print("Best Parameters:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Grid Search started
---------------------------------------
Pipeline: ['vect', 'tfidf', 'clf']
Grid Search Parameters:
{'vect__max_df': (0.5, 1.0), 'vect__max_features': (None, 1000, 5000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.1, 0.01, 0.001), 'clf__penalty': ('l2', 'elasticnet'), 'clf__max_iter': (10, 50)}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 45.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 72.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 88.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 106.0min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 118.7min finished


done in 7131.297s
----------------------------------------------
Best Score: 0.766
-------------------------------------------
Best Parameters:
	clf__alpha: 0.001
	clf__max_iter: 10
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 1.0
	vect__max_features: None
	vect__ngram_range: (1, 2)


In [5]:
param_grid = {
                'sgdclassifier__learning_rate':['constant','optimal','invscaling'],
                'sgdclassifier__eta0':[0.0,0.01,0.1,0.3,0.5,0.7],
                'sgdclassifier__alpha':[0.0001,0.001,0.01,0.1]}

In [23]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__alpha', 'clf__average', 'clf__class_weight', 'clf__early_stopping', 'clf__epsilon', 'clf__eta0', 'clf__fit_intercept', 'clf__l1_ratio', 'clf__learning_rate', 'clf__loss', 'clf__max_iter', 'clf__n_iter_no_change', 'clf__n_jobs', 'clf__penalty', 'clf__power_t', 'clf__random_state', 'clf__shuffle', 'clf__tol', 'clf__validation_fraction', 'clf__verbose', 'clf__warm_start'])

In [28]:
import pickle
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(grid_search, open(filename, 'wb'))

In [37]:
test_set = [
    'hey there',
    'Coorg trip advice',
    'movie tickets for sale',
    'Advice needed for treatment of hair fall',
    'Moving out sale',
    'RE: Selling Honda City'
]

In [38]:
grid_search.best_estimator_.predict(np.array(test_set))

array(['deal discrepancies', 'calendar', 'logistics', 'calendar',
       'logistics', 'deal discrepancies'], dtype='<U19')

In [6]:
import pickle
# save the model to disk
filename = 'finalized_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))

In [7]:
test_set = [
    'hey there',
    'california',
    'movie tickets for sale',
    'Advice needed for treatment of hair fall',
    'Moving out sale',
    'RE: Selling Honda City'
]

In [8]:
loaded_model.best_estimator_.predict(np.array(test_set))

array(['deal discrepancies', 'california', 'logistics', 'calendar',
       'logistics', 'deal discrepancies'], dtype='<U19')

In [None]:
from flask import Flask, render_template, request
import pickle

app = Flask(__name__,template_folder = r"C:\Users\Deepnil Vasava\Documents\JUPYTER=========\mail") 
                                                                                   
    

loaded_model = pickle.load(open("finalized_model.sav", "rb"))

@app.route('/')

def symptom():
    
   return render_template('text_pred.html')

@app.route('/result',methods = ['POST', 'GET'])
def result():
   if request.method == 'POST':
      result = request.form['Data']
      result_pred = loaded_model.best_estimator_.predict(np.array(result))
      return render_template("text_result.html",result = result_pred)

if __name__ == '__main__':
   app.run(host="localhost", port=5001, debug=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://localhost:5000/ (Press CTRL+C to quit)
