In [29]:
import numpy as np
import pandas as pd
import regex as re
import pickle
import nltk
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

In [30]:
# Fetch the NLTK data packages this notebook relies on.
# 'wordnet' is required by WordNetLemmatizer (used in the lemmatization cell);
# without it a fresh environment fails with a LookupError on the first
# lemmatize() call. 'omw-1.4' is the multilingual WordNet companion package.
for nltk_resource in ('punkt', 'stopwords', 'words', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [31]:
# Wall-clock start; the last cell reports the elapsed time against this value.
Start = time.time()

# Read both CSV files the same way, replacing every missing value with a
# single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('train.csv', 'test.csv')
)

In [32]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Shared helpers for the lemmatization step.
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# NOTE(review): stopWords is not referenced by any other cell visible in this
# notebook; kept in case code outside this view uses it.
stopWords = stopwords.words('english')

def lemmatize_text(text):
    """Split `text` on whitespace and return the list of lemmatized tokens."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize every comment and glue the tokens back together with single
# spaces, in one pass per frame. The result written to the "text" column is
# the same as lemmatizing first and joining in a second pass, without the
# intermediate token-list Series or temporary names.
train["text"] = train["text"].apply(lambda text: ' '.join(lemmatize_text(text)))
test["text"] = test["text"].apply(lambda text: ' '.join(lemmatize_text(text)))

In [33]:
train_comments_list = train["text"].tolist()
test_comments_list = test["text"].tolist()

# Exact-token replacement table: a whitespace-separated token that equals a key
# is swapped for the mapped text before non-letter characters are stripped.
#
# The earlier version of this literal repeated several keys (":-(", ":(", ":s",
# ":-s", ":/", ":>", ":')", "m"). Python keeps only the LAST value of a repeated
# key, so the overridden entries were dead; each key is now listed once with
# the value that was actually in effect.
#
# NOTE(review): the lookup is case-sensitive and nothing in this cell
# lowercases the text, so tokens such as ":D" or "Don't" are not replaced.
# Changing that would alter the model's features, so it is left as is.
sensibleWords = {
    # Emoticons
    ":')": " sad ",
    ":-(": " frown ",
    ":(": " frown ",
    ":s": " frown ",
    ":-s": " frown ",
    ":/": " worry ",
    ":>": " angry ",
    "<3": " heart ",
    # Interjections
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    # Shorthand and contractions
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    # Smiling emoticons
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    # NOTE(review): the keys below look like regex patterns, but the lookup is
    # an exact token comparison, so they only fire on a token that is literally
    # e.g. `\bu\b`. They duplicate the plain keys above and are kept only so
    # the mapping is unchanged.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
}

# Kept for backward compatibility: the replaceable tokens, in insertion order.
nonsenseWords = list(sensibleWords)

# Everything except ASCII letters, spaces, '?' and '!' is removed.
_DISALLOWED_CHARACTERS = re.compile('[^a-zA-Z ?!]+')


def _normalise_comment(comment):
    """Return `comment` with links dropped, tokens replaced and symbols stripped.

    Steps, in order:
      1. Split on whitespace.
      2. Drop tokens that start with 'http' or 'www'.
      3. Replace tokens that are keys of `sensibleWords` (dict lookup instead
         of scanning a list for every token).
      4. Re-join, appending one space after every kept token (so a non-empty
         result ends with a space).
      5. Delete every character that is not a letter, space, '?' or '!'.
    """
    kept_tokens = [
        sensibleWords.get(token, token)
        for token in str(comment).split()
        if not token.startswith(('http', 'www'))
    ]
    sentence = "".join(token + " " for token in kept_tokens)
    return _DISALLOWED_CHARACTERS.sub('', sentence)


# Same cleaning for both frames via the shared helper.
train["text"] = [_normalise_comment(comment) for comment in train_comments_list]
test["text"] = [_normalise_comment(comment) for comment in test_comments_list]


In [34]:
# Cleaned text columns, plus their concatenation (train rows first, then test
# rows) which the vectorizers below are fitted on.
train_text, test_text = train["text"], test["text"]
complete_text = pd.concat([train_text, test_text])

In [35]:
# Word-level TF-IDF over unigrams and bigrams, capped at 30000 features.
# NOTE(review): lowercase is disabled while stop_words='english' is enabled;
# presumably capitalised stop words therefore survive filtering — confirm
# this is intended.
word_vectorizer_settings = {
    'sublinear_tf': True,
    'strip_accents': 'unicode',
    'analyzer': 'word',
    'token_pattern': '(?u)\\b\\w\\w+\\b\\w{,1}',
    'lowercase': False,
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.5,
    'norm': 'l2',
    'max_features': 30000,
}

# The vectorizer is fitted on train and test text together (complete_text),
# then each split is transformed separately.
word_vectorizer = TfidfVectorizer(**word_vectorizer_settings)
word_vectorizer.fit(complete_text)
train_word_features, test_word_features = (
    word_vectorizer.transform(split_text)
    for split_text in (train_text, test_text)
)


In [36]:
# Character-level TF-IDF over 2- to 6-character n-grams, capped at 20000
# features.
char_vectorizer_settings = {
    'sublinear_tf': True,
    'strip_accents': 'unicode',
    'analyzer': 'char',
    'ngram_range': (2, 6),
    'min_df': 2,
    'max_df': 0.5,
    'max_features': 20000,
}

# Fitted on train and test text together (complete_text); each split is then
# transformed separately.
char_vectorizer = TfidfVectorizer(**char_vectorizer_settings)
char_vectorizer.fit(complete_text)
train_char_features, test_char_features = (
    char_vectorizer.transform(split_text)
    for split_text in (train_text, test_text)
)

In [37]:
# Place the character features and the word features side by side
# (character columns first) for each split.
train_features, test_features = (
    hstack([char_block, word_block])
    for char_block, word_block in (
        (train_char_features, train_word_features),
        (test_char_features, test_word_features),
    )
)


In [38]:
# Label columns of `train`; one model is fitted per entry, in this order.
categories = [
    'harsh',
    'extremely_harsh',
    'vulgar',
    'threatening',
    'disrespect',
    'targeted_hate',
]

# CSR copy of the training matrix.
# NOTE(review): `temp` is not referenced by the later cells visible here.
temp = train_features.tocsr()



In [50]:
cross_validation_scores = []

# One Ridge model per label column. Each model is first scored with 50-fold
# cross-validation (ROC AUC), then refitted on the full training set and
# appended to a single pickle file, in `categories` order.
#
# The `with` block guarantees the file is flushed and closed even if a
# cross-validation run or a fit raises part-way through the loop; previously
# the handle stayed open (and the file possibly truncated) in that case.
with open('model_storage.pckl', 'wb') as model_storage:
    for category in categories:
        train_target = train[category]
        ridgeClassifier = Ridge(solver = 'sag', max_iter = 50, fit_intercept = True, tol = 0.001, alpha = 70, copy_X = True, random_state = 0)
        cv_score = np.mean(cross_val_score(ridgeClassifier, train_features, train_target, cv = 50, scoring = 'roc_auc'))
        cross_validation_scores.append(cv_score)
        ridgeClassifier.fit(train_features, train_target)
        pickle.dump(ridgeClassifier, model_storage)

# Mean of the per-category mean CV scores.
print('Cross Validation score is {}'.format(np.mean(cross_validation_scores)))

Cross Validation score is 0.9826927602412847


In [46]:
Output = pd.DataFrame.from_dict({'id': test['id']})
models = []

# Read back every model that was pickled into the file, one after another,
# until the end of the file is reached.
# SECURITY NOTE: pickle.load can execute arbitrary code. This is acceptable
# only because the file is produced by the training cell of this notebook;
# never point this at a pickle from an untrusted source.
with open("model_storage.pckl", "rb") as file:
    while True:
        try:
            models.append(pickle.load(file))
        except EOFError:
            break

# Models were dumped in `categories` order, so the two sequences must line up
# one-to-one; fail loudly instead of mislabelling columns or hitting an
# IndexError half-way through.
if len(models) != len(categories):
    raise ValueError(
        'Expected {} models in model_storage.pckl but loaded {}'.format(
            len(categories), len(models)
        )
    )

# One prediction column per category.
for category, model in zip(categories, models):
    Output[category] = model.predict(test_features)

Output.to_csv('FinalSubmission.csv', index = False)

# Elapsed wall-clock time since `Start` was recorded in the data-loading cell.
End = time.time()
print('Time of execution: {} minutes'.format((End - Start) / 60)) 

Time of execution: 192.60360910495123 minutes
