In [64]:
!which python

/anaconda3/bin/python


In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import datetime

In [66]:
df=pd.read_csv('training_data/english/agr_en_train.csv',sep=',',names=['id','text','result'])

In [67]:
df.head()

Unnamed: 0,id,text,result
0,facebook_corpus_msr_1723796,Well said sonu..you have courage to stand agai...,OAG
1,facebook_corpus_msr_466073,"Most of Private Banks ATM's Like HDFC, ICICI e...",NAG
2,facebook_corpus_msr_1493901,"Now question is, Pakistan will adhere to this?",OAG
3,facebook_corpus_msr_405512,Pakistan is comprised of fake muslims who does...,OAG
4,facebook_corpus_msr_1521685,"??we r against cow slaughter,so of course it w...",NAG


In [68]:
len(df)

11999

In [69]:
df.loc[df['result']=='NAG','result']=0

In [70]:
df.loc[df['result']=='CAG','result']=50
df.loc[df['result']=='OAG','result']=100

In [71]:
df.head()

Unnamed: 0,id,text,result
0,facebook_corpus_msr_1723796,Well said sonu..you have courage to stand agai...,100
1,facebook_corpus_msr_466073,"Most of Private Banks ATM's Like HDFC, ICICI e...",0
2,facebook_corpus_msr_1493901,"Now question is, Pakistan will adhere to this?",100
3,facebook_corpus_msr_405512,Pakistan is comprised of fake muslims who does...,100
4,facebook_corpus_msr_1521685,"??we r against cow slaughter,so of course it w...",0


In [72]:
df_x=df.text
df_y=df.result

In [73]:
tfidf = TfidfVectorizer(min_df=1,stop_words='english')

In [74]:
tf_transformer=tfidf.fit(df_x)

In [75]:
len(tf_transformer.vocabulary_)

21467

In [76]:
tf_transformer_cv=tfidf.transform(df_x)

In [77]:
tfidf.vocabulary_

{'said': 16801,
 'sonu': 18003,
 'courage': 5000,
 'stand': 18267,
 'dadagiri': 5284,
 'muslims': 13019,
 'private': 15140,
 'banks': 2502,
 'atm': 2083,
 'like': 11480,
 'hdfc': 8872,
 'icici': 9395,
 'cash': 3779,
 'public': 15379,
 'sector': 17154,
 'bank': 2492,
 'working': 20986,
 'question': 15541,
 'pakistan': 14069,
 'adhere': 1133,
 'comprised': 4613,
 'fake': 7283,
 'does': 6255,
 'know': 11000,
 'meaning': 12277,
 'unity': 20010,
 'imposes': 9560,
 'thoughts': 19284,
 'rascals': 15782,
 'gathered': 8053,
 'cow': 5019,
 'slaughter': 17821,
 'course': 5003,
 'stop': 18392,
 'leather': 11335,
 'manufacturing': 12075,
 'happens': 8730,
 'wondering': 20968,
 'educated': 6617,
 'ambassador': 1548,
 'struggling': 18461,
 'pay': 14319,
 'credit': 5092,
 'debit': 5415,
 'decent': 5434,
 'restaurant': 16312,
 'imagine': 9502,
 'diplomat': 6005,
 'developed': 5812,
 'nation': 13180,
 'having': 8845,
 'card': 3741,
 'needs': 13276,
 'dinner': 5995,
 'inflation': 9747,
 'react': 15848,
 

In [78]:
tf_transformer_cv.shape

(11999, 21467)

In [79]:
x_traincv=tfidf.fit_transform(df_x)

In [80]:
lr=LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')

In [81]:
lr.fit(x_traincv,df_y.astype('int'))



LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False)

In [82]:
df_test=pd.read_csv('training_data/english/agr_en_dev.csv',names=['id','text','result'])

In [83]:
df_test.head()

Unnamed: 0,id,text,result
0,facebook_corpus_msr_451811,The quality of re made now makes me think it i...,CAG
1,facebook_corpus_msr_334368,@siva \nHow is ur mother???\nHow is ur wife???...,NAG
2,facebook_corpus_msr_331195,Also see ....hw ur RSS activist caught in Burk...,NAG
3,facebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING\n...,NAG
4,facebook_corpus_msr_379239,Modi ho ya Manmohan singh saala yeh log kuch n...,OAG


In [84]:
df_test.loc[df_test['result']=='NAG','result']=0
df_test.loc[df_test['result']=='CAG','result']=50
df_test.loc[df_test['result']=='OAG','result']=100

In [85]:
df_test.head()

Unnamed: 0,id,text,result
0,facebook_corpus_msr_451811,The quality of re made now makes me think it i...,50
1,facebook_corpus_msr_334368,@siva \nHow is ur mother???\nHow is ur wife???...,0
2,facebook_corpus_msr_331195,Also see ....hw ur RSS activist caught in Burk...,0
3,facebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING\n...,0
4,facebook_corpus_msr_379239,Modi ho ya Manmohan singh saala yeh log kuch n...,100


In [86]:
x_testcv=tfidf.transform(df_test['text'])

In [87]:
pred=lr.predict(x_testcv)

In [88]:
y_test=df_test['result'].astype('int')

In [89]:
lr.score(x_testcv,y_test)

0.5188270576474509

In [90]:
now=datetime.datetime.now()
now=str(now).replace('-','_').replace(' ','_').replace(':','_').split('.')[0]
print(now)

model_filename='lr_multiclass_{}.smamodel'.format(str(now))
vectorizer_filename='tfidf_{}.smavec'.format(str(now))

print(model_filename)
print(vectorizer_filename)

model_file_ptr=open(model_filename,'wb')
vectorizer_file_ptr=open(vectorizer_filename,'wb')

pickle.dump(lr,model_file_ptr)
pickle.dump(tf_transformer.vocabulary_,vectorizer_file_ptr)

model_file_ptr.close()
vectorizer_file_ptr.close()

2019_06_30_20_28_34
lr_multiclass_2019_06_30_20_28_34.smamodel
tfidf_2019_06_30_20_28_34.smavec


In [91]:
loaded_model = pickle.load(open(model_filename, 'rb'))
loaded_vocab=pickle.load(open(vectorizer_filename,'rb'))
# tfidf_loaded=TfidfVectorizer(vocabulary=loaded_vec)

In [92]:
tf_loaded=TfidfVectorizer(vocabulary=loaded_vocab)

In [93]:
tf_loaded.vocabulary

{'said': 16801,
 'sonu': 18003,
 'courage': 5000,
 'stand': 18267,
 'dadagiri': 5284,
 'muslims': 13019,
 'private': 15140,
 'banks': 2502,
 'atm': 2083,
 'like': 11480,
 'hdfc': 8872,
 'icici': 9395,
 'cash': 3779,
 'public': 15379,
 'sector': 17154,
 'bank': 2492,
 'working': 20986,
 'question': 15541,
 'pakistan': 14069,
 'adhere': 1133,
 'comprised': 4613,
 'fake': 7283,
 'does': 6255,
 'know': 11000,
 'meaning': 12277,
 'unity': 20010,
 'imposes': 9560,
 'thoughts': 19284,
 'rascals': 15782,
 'gathered': 8053,
 'cow': 5019,
 'slaughter': 17821,
 'course': 5003,
 'stop': 18392,
 'leather': 11335,
 'manufacturing': 12075,
 'happens': 8730,
 'wondering': 20968,
 'educated': 6617,
 'ambassador': 1548,
 'struggling': 18461,
 'pay': 14319,
 'credit': 5092,
 'debit': 5415,
 'decent': 5434,
 'restaurant': 16312,
 'imagine': 9502,
 'diplomat': 6005,
 'developed': 5812,
 'nation': 13180,
 'having': 8845,
 'card': 3741,
 'needs': 13276,
 'dinner': 5995,
 'inflation': 9747,
 'react': 15848,
 

In [59]:
x_test_cv=tf_loaded.fit_transform(df_test['text'])

In [60]:
x_test_cv.shape

(3001, 9949)

In [None]:
def decrpyt(msg,skip=1):
    dec=''
    ran=[x for x in range(97,123)]
#     print(ran)
    for ch in msg:
        temp=ord(ch)+skip+1
        if ord(ch) in ran:
            if temp not in ran:
#                 print(temp,ch)
                temp=chr(temp-26)
#                 print(temp)
            else:
                temp=chr(temp)
            dec+=temp
        else:
            dec+=ch
    return dec

In [None]:
decrpyt("f elmb vlr afakq qoxkpixqb fq yv exka. qexqp texq zljmrqbop xob clo. alfkd fq fk yv exka fp fkbccfzfbkq xka qexq'p tev qefp qbuq fp pl ilkd. rpfkd pqofkd.jxhbqoxkp() fp obzljjbkaba. klt xmmiv lk qeb roi.",skip=2)

In [None]:
def encrypt(msg,skip=1):
    enc=''
    ran=[x for x in range(97,123)]
#     print(ran)
    for ch in msg:
        temp=ord(ch)-skip-1
        if ord(ch) in ran:
            if temp not in ran:
#                 print(temp,ch)
                temp=chr(temp+26)
#                 print(temp)
            else:
                temp=chr(temp)
            enc+=temp
        else:
            enc+=ch
    return enc

In [None]:
encrypt("i hope you didnt translate it by hand. thats what computers are for. doing it in by hand is inefficient and that's why this text is so long. using string.maketrans() is recommended. now apply on the url.",skip=2)

In [None]:
msg="f elmb vlr afakq qoxkpixqb fq yv exka. qexqp texq zljmrqbop xob clo. alfkd fq fk yv exka fp fkbccfzfbkq xka qexq'p tev qefp qbuq fp pl ilkd. rpfkd pqofkd.jxhbqoxkp() fp obzljjbkaba. klt xmmiv lk qeb roi."

# Preprocessing of text

In [162]:
demo=pd.read_csv('training_data/english/agr_en_train.csv',names=['id','text','result'])

In [163]:
demo.head()

Unnamed: 0,id,text,result
0,facebook_corpus_msr_1723796,Well said sonu..you have courage to stand agai...,OAG
1,facebook_corpus_msr_466073,"Most of Private Banks ATM's Like HDFC, ICICI e...",NAG
2,facebook_corpus_msr_1493901,"Now question is, Pakistan will adhere to this?",OAG
3,facebook_corpus_msr_405512,Pakistan is comprised of fake muslims who does...,OAG
4,facebook_corpus_msr_1521685,"??we r against cow slaughter,so of course it w...",NAG


In [185]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import string


def preprocess(document):
    sentence = document.lower()
    stopwords=nltk.corpus.stopwords.words('english')
    punctuations=string.punctuation
#     print(punctuations)
    sentence_words = nltk.word_tokenize(sentence)
    for word in sentence_words:
        if word in punctuations  or word in stopwords:
            sentence_words.remove(word)

    sentence_words
#     print("{0:20}{1:20}{2:20}".format("Word","Lemma verb","Lemma noun"))
    lem_v=[]
    lem_n=[]
    for word in sentence_words:
    #     print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))
        temp_v=wordnet_lemmatizer.lemmatize(word,pos='v')
        lem_v.append(temp_v)
        temp_n=wordnet_lemmatizer.lemmatize(temp_v,pos='n')
        lem_n.append(temp_n)
#         print ("{0:20}{1:20}{2:20}".format(word,temp_v,temp_n))

#     print(' '.join(lem_n))
    return ' '.join(lem_n)

In [186]:
demo.head()

Unnamed: 0,id,text,result,preprocessed_text
0,facebook_corpus_msr_1723796,Well said sonu..you have courage to stand agai...,OAG,well say sonu..you courage stand dadagiri muslim
1,facebook_corpus_msr_466073,"Most of Private Banks ATM's Like HDFC, ICICI e...",NAG,private bank atm 's like hdfc icici etc out of...
2,facebook_corpus_msr_1493901,"Now question is, Pakistan will adhere to this?",OAG,"question , pakistan adhere this"
3,facebook_corpus_msr_405512,Pakistan is comprised of fake muslims who does...,OAG,pakistan comprise fake muslim do know mean uni...
4,facebook_corpus_msr_1521685,"??we r against cow slaughter,so of course it w...",NAG,? r cow slaughter so course will stop leather ...


# Convert the below cells and run after completing the demo

import multiprocessing

def process_first_half(index):
        demo['preprocessed_text'][:index]=demo['text'][:index].apply(lambda x: preprocess(x))
def process_second_half(index):
    demo['preprocessed_text'][index:]=demo['text'][index:].apply(lambda x: preprocess(x))

length=len(demo)%2
if len(demo)%2==0:
    p1 = multiprocessing.Process(target=process_first_half,args=(length,))
    p2 = multiprocessing.Process(target=process_second_half,args=(length,))

else:
    length=int(len(demo/2))
    p1 = multiprocessing.Process(target=process_first_half,args=(length,))
    p2 = multiprocessing.Process(target=process_second_half,args=(length,))


p1.start()
p2.start()

p1.join()
p2.join()

In [187]:
import datetime
start=datetime.datetime.now()
demo['preprocessed_text']=demo['text'].apply(lambda x: preprocess(x))
end=datetime.datetime.now()
print('Total preprocessing time:- {}'.format(end-start))

Total preprocessing time:- 0:00:09.105910


In [188]:
demo.head()

Unnamed: 0,id,text,result,preprocessed_text
0,facebook_corpus_msr_1723796,Well said sonu..you have courage to stand agai...,OAG,well say sonu..you courage stand dadagiri muslim
1,facebook_corpus_msr_466073,"Most of Private Banks ATM's Like HDFC, ICICI e...",NAG,private bank atm 's like hdfc icici etc out of...
2,facebook_corpus_msr_1493901,"Now question is, Pakistan will adhere to this?",OAG,"question , pakistan adhere this"
3,facebook_corpus_msr_405512,Pakistan is comprised of fake muslims who does...,OAG,pakistan comprise fake muslim do know mean uni...
4,facebook_corpus_msr_1521685,"??we r against cow slaughter,so of course it w...",NAG,? r cow slaughter so course will stop leather ...


In [190]:
demo.loc[demo['result']=='NAG','result']=0
demo.loc[demo['result']=='CAG','result']=50
demo.loc[demo['result']=='OAG','result']=100
# demo_x=demo['preprocessed_text']
# demo_y=demo['result'].astype('int')

In [191]:
demo_x=demo['preprocessed_text']
demo_y=demo['result'].astype('int')

In [196]:
demo_vec=TfidfVectorizer()
train_vec=demo_vec.fit_transform(demo_x)

In [355]:
demo_vec

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [370]:
for e in demo_vec.vocabulary_:
    if e in ['we', 'take', 'need', 'mother', 'live', 'india', 'gaddar',
       'fucker', 'first', 'discus', 'action']:
        print(e,demo_vec.vocabulary_[e])
        print('idf:- ',demo_vec.idf_[demo_vec.vocabulary_[e]])

need 11389
idf:-  4.274564730728789
take 16017
idf:-  4.0665124556150385
we 17586
idf:-  4.038291887972787
india 8176
idf:-  3.3313275618596996
first 6375
idf:-  4.726235240657705
live 9864
idf:-  4.912023005428146
mother 10984
idf:-  5.961845129926823
discus 5183
idf:-  6.061928588483807
action 1035
idf:-  5.362224007377702
gaddar 6692
idf:-  9.294049640102028
fucker 6626
idf:-  9.699514748210191


In [397]:
demo_lr=LogisticRegression(C=0.9, solver='lbfgs', multi_class='multinomial', max_iter=50, class_weight='balanced',random_state=4)
demo_lr.fit(train_vec,demo_y)



LogisticRegression(C=0.9, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=50,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=4, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False)

In [398]:
x_testcv=demo_vec.transform(["Don't discuss just take action and we need to first all mother fucker gaddar who live in India"])

In [399]:
demo_lr.score(train_vec,demo_y)

0.8005667138928244

In [350]:
0.5554815061646118<0.5631456181272909



True

In [391]:
now=datetime.datetime.now()
now=str(now).replace('-','_').replace(' ','_').replace(':','_').split('.')[0]
print(now)

model_filename='lr_50_iter_balanced_rs_{}.smamodel'.format(str(now))
vectorizer_filename='tfidf_lemma_{}.smavec'.format(str(now))

print(model_filename)
print(vectorizer_filename)

model_file_ptr=open(model_filename,'wb')
vectorizer_file_ptr=open(vectorizer_filename,'wb')

pickle.dump(demo_lr,model_file_ptr)
pickle.dump(demo_vec.vocabulary_,vectorizer_file_ptr)

model_file_ptr.close()
vectorizer_file_ptr.close()

2019_07_01_16_22_04
lr_50_iter_balanced_rs_2019_07_01_16_22_04.smamodel
tfidf_lemma_2019_07_01_16_22_04.smavec


In [392]:
loaded_model = pickle.load(open('lr_50_iter_balanced_rs_2019_07_01_16_22_04.smamodel', 'rb'))
vector_loaded = pickle.load(open('tfidf_lemma_2019_07_01_16_22_04.smavec', 'rb'))
tfidf = TfidfVectorizer(vocabulary=vector_loaded)
text_vec = tfidf.fit_transform(["Don't discuss just take action and we need to first all mother fucker gaddar who live in India"])
print(text_vec.shape)
tfidf.stop_words = 'english'
# print(vectorizer.stop_words)
print(tfidf.inverse_transform(text_vec))
op = loaded_model.predict(text_vec)
print(op)

(1, 18290)
[array(['who', 'we', 'to', 'take', 'need', 'mother', 'live', 'just',
       'india', 'in', 'gaddar', 'fucker', 'first', 'don', 'and', 'all',
       'action'], dtype='<U127')]
[100]


In [384]:
vector_loaded = pickle.load(open('tfidf_lemma_2019_07_01_16_07_35.smavec', 'rb'))
tfidf = TfidfVectorizer(vocabulary=vector_loaded)

In [387]:
text_vec = tfidf.fit_transform(["Don't discuss just take action and we need to first all mother fucker gaddar who live in India"])
print(text_vec.shape)
tfidf.stop_words = 'english'
# print(vectorizer.stop_words)
print(tfidf.inverse_transform(text_vec))
op = loaded_model.predict(text_vec)
print(op)

(1, 18290)
[array(['who', 'we', 'to', 'take', 'need', 'mother', 'live', 'just',
       'india', 'in', 'gaddar', 'fucker', 'first', 'don', 'and', 'all',
       'action'], dtype='<U127')]
[100]
