In [45]:
import csv
import os
import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split


# On-disk cache locations (relative to the working directory): the pickled
# trained classifier and the pickled fitted CountVectorizer. When both files
# exist, get_classifier_and_count_vectorizer() loads them instead of retraining.
clf_pkl_file = "classifier.pkl"
count_vectorizer_file = "count_vectorizer.pkl"

def create_pickle_file(clf , count_vectorizer):
    """Persist the trained classifier and the fitted vectorizer to disk.

    The objects are pickled to the module-level paths ``clf_pkl_file`` and
    ``count_vectorizer_file`` respectively, overwriting any existing files.

    Args:
        clf: The trained classifier to cache.
        count_vectorizer: The fitted vectorizer to cache.
    """
    # Context managers guarantee the handles are closed even if pickling fails.
    with open(clf_pkl_file, "wb") as clf_out:
        pickle.dump(clf, clf_out)
    with open(count_vectorizer_file, "wb") as vectorizer_out:
        pickle.dump(count_vectorizer, vectorizer_out)
    
def get_dataset(path='./datasets/training.csv'):
    """Load a comma-separated training file into a DataFrame.

    The shape of the loaded frame is printed as a quick sanity check.

    Args:
        path: Location of the CSV file. Defaults to the project's bundled
            training set, so existing zero-argument callers are unaffected.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # QUOTE_MINIMAL (== 0) treats quote characters as field delimiters only
    # where needed, so quoted comments may contain commas.
    dataset = pd.read_csv(path, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    print("Dataset Shape " + str(dataset.shape))
    return dataset

def stammer_and_remove_stopwords(dataset):
    """Normalise every comment in ``dataset`` for bag-of-words vectorisation.

    Each comment has every non-ASCII-letter character replaced by a space, is
    lower-cased and split on whitespace; English stop words are dropped and
    the remaining words are Porter-stemmed (e.g. "increase" and "increasing"
    collapse to one stem) and re-joined with single spaces.

    Args:
        dataset: A mapping/DataFrame whose ``'Comment'`` entry is an iterable
            of comment strings.

    Returns:
        A list of cleaned comment strings, in the same order as the input.
    """
    # Build the stop-word set and the stemmer once; recreating them for every
    # word / row dominated the running time of this function.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    filtered_sentence = []
    # Iterate values positionally so a frame with a non-default index
    # (e.g. after filtering rows) does not raise KeyError on label lookup.
    for raw_comment in dataset['Comment']:
        words = re.sub('[^a-zA-Z]', ' ', raw_comment).lower().split()
        stems = [ps.stem(word) for word in words if word not in stop_words]
        filtered_sentence.append(' '.join(stems))
    return filtered_sentence


def get_classifier_and_count_vectorizer():
    """Return a trained classifier together with its fitted CountVectorizer.

    If both cache files (``clf_pkl_file`` and ``count_vectorizer_file``) exist
    they are unpickled and returned. Otherwise the training set is loaded and
    cleaned, a 1500-feature bag-of-words vectorizer is fitted, a logistic
    regression model is trained on 90% of the rows and scored on the
    remaining 10%, and both objects are pickled for later calls.

    Returns:
        A ``(classifier, count_vectorizer)`` tuple.
    """
    if os.path.isfile(clf_pkl_file) and os.path.isfile(count_vectorizer_file):
        # NOTE: unpickling executes code embedded in the file; only load
        # cache files that this program wrote itself.
        with open(clf_pkl_file, 'rb') as clf_file:
            classifier = pickle.load(clf_file)
        with open(count_vectorizer_file, 'rb') as count_vec_file:
            count_vectorizer = pickle.load(count_vec_file)
        return classifier , count_vectorizer

    dataset = get_dataset()
    filtered_sentence = stammer_and_remove_stopwords(dataset)

    count_vectorizer = CountVectorizer(max_features=1500)
    count_vectorizer.fit(filtered_sentence)
    X = count_vectorizer.transform(filtered_sentence).toarray()
    # The first column of the dataset is used as the label.
    Y = dataset.iloc[:, 0].values

    print(X.shape)
    print(Y.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=0)

    print(X_train.shape)
    print(Y_train.shape)

    print(X_test.shape)
    print(Y_test.shape)

    classifier = LogisticRegression(random_state=120)
    classifier.fit(X_train, Y_train)
    print("Logistic Regrression : " + str(classifier.score(X_test , Y_test)))

    # Alternatives tried earlier, with the held-out scores that were noted
    # at the time:
    #   LinearDiscriminantAnalysis()            0.7818
    #   SVC()                                   0.7576
    #   DecisionTreeClassifier(random_state=0)  0.7409
    #   GaussianNB()                            0.4621
    # RandomForestClassifier(max_depth=2, random_state=0) and MultinomialNB()
    # were also tried; no scores were recorded for them.

    create_pickle_file(classifier , count_vectorizer)
    return classifier , count_vectorizer
    
    
        
def start_comment_analysis(comment):
    """Classify a single comment and print whether it is an insult.

    The comment is cleaned the same way as the training data (non-letters
    replaced by spaces, lower-cased, English stop words removed, Porter
    stemming), vectorised with the fitted CountVectorizer and passed to the
    classifier. A truthy prediction prints "Insult Found", otherwise
    "No Insult".

    Args:
        comment: The raw comment text to analyse.
    """
    classifier , count_vectorizer = get_classifier_and_count_vectorizer()

    # Build the stop-word set once instead of once per word.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    words = re.sub('[^a-zA-Z]', ' ', comment).lower().split()
    cleaned = ' '.join(ps.stem(word) for word in words if word not in stop_words)

    x = count_vectorizer.transform([cleaned])
    pred = classifier.predict(x)
    if pred[0]:
        print("Insult Found !!!!!!!!!!!!!!")
    else:
        print("No Insult !!!!!!!!!!!")

# Run the classifier over a handful of hand-picked comments, in order.
sample_comments = (
    "shut the fuck up",
    "you are good",
    "Even as a troll you are a pathetic failure.",
    "You with the 'racist' screen name\n\nYou are a PieceOfShit..........",
    "And you know they've burned holes in all the carpeting.",
)
for sample in sample_comments:
    start_comment_analysis(sample)

    

Insult Found !!!!!!!!!!!!!!
No Insult !!!!!!!!!!!
Insult Found !!!!!!!!!!!!!!
Insult Found !!!!!!!!!!!!!!
No Insult !!!!!!!!!!!
