# Logistic Regression

In [1]:
import pandas as pd
df = pd.read_excel (r'C:\Users\HP\Downloads\Dataset.xlsx')
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,INTENT,ID,Unnamed: 2
0,Please clear the italics sentence,5,
1,Stop dictation near the last paragraph,23,
2,left align the first word of selected paragraph,16,
3,clear bold for last two sentences,3,
4,insert comment on the selected line,15,


In [2]:
df = pd.DataFrame(df) 
df.drop(['Unnamed: 2'], axis=1)
df.dropna(subset=['INTENT'], inplace=True)
df = df[['ID', 'INTENT']]
df.head()

Unnamed: 0,ID,INTENT
0,5,Please clear the italics sentence
1,23,Stop dictation near the last paragraph
2,16,left align the first word of selected paragraph
3,3,clear bold for last two sentences
4,15,insert comment on the selected line


In [3]:
import re

def tokenize(text):
    tokens=re.split('\W+',text)
    return tokens
df['intent_tokenized']=df['INTENT'].apply(lambda x:tokenize(x.lower()))
df.head()

Unnamed: 0,ID,INTENT,intent_tokenized
0,5,Please clear the italics sentence,"[please, clear, the, italics, sentence]"
1,23,Stop dictation near the last paragraph,"[stop, dictation, near, the, last, paragraph]"
2,16,left align the first word of selected paragraph,"[left, align, the, first, word, of, selected, ..."
3,3,clear bold for last two sentences,"[clear, bold, for, last, two, sentences]"
4,15,insert comment on the selected line,"[insert, comment, on, the, selected, line]"


In [4]:
import nltk
#stopword=nltk.download('stopwords')
stopword=nltk.corpus.stopwords.words('english')
stopword.append("please")
stopword.append("could")
stopword.append("need")
stopword.append(" ")
stopword.append("kindly")
def remove_stopword(tokenised_list):
    text=[word for word in tokenised_list if word not in stopword]
    return text
df['intent_nostop']=df['intent_tokenized'].apply(lambda x:remove_stopword(x))
df.head()

Unnamed: 0,ID,INTENT,intent_tokenized,intent_nostop
0,5,Please clear the italics sentence,"[please, clear, the, italics, sentence]","[clear, italics, sentence]"
1,23,Stop dictation near the last paragraph,"[stop, dictation, near, the, last, paragraph]","[stop, dictation, near, last, paragraph]"
2,16,left align the first word of selected paragraph,"[left, align, the, first, word, of, selected, ...","[left, align, first, word, selected, paragraph]"
3,3,clear bold for last two sentences,"[clear, bold, for, last, two, sentences]","[clear, bold, last, two, sentences]"
4,15,insert comment on the selected line,"[insert, comment, on, the, selected, line]","[insert, comment, selected, line]"


In [5]:
ps=nltk.PorterStemmer()
def stemming(tokenised_list):
    text=[ps.stem(word) for word in tokenised_list ]
    return text
df['intent_nostop_stem']=df['intent_nostop'].apply(lambda x:stemming(x))
df.head()

Unnamed: 0,ID,INTENT,intent_tokenized,intent_nostop,intent_nostop_stem
0,5,Please clear the italics sentence,"[please, clear, the, italics, sentence]","[clear, italics, sentence]","[clear, ital, sentenc]"
1,23,Stop dictation near the last paragraph,"[stop, dictation, near, the, last, paragraph]","[stop, dictation, near, last, paragraph]","[stop, dictat, near, last, paragraph]"
2,16,left align the first word of selected paragraph,"[left, align, the, first, word, of, selected, ...","[left, align, first, word, selected, paragraph]","[left, align, first, word, select, paragraph]"
3,3,clear bold for last two sentences,"[clear, bold, for, last, two, sentences]","[clear, bold, last, two, sentences]","[clear, bold, last, two, sentenc]"
4,15,insert comment on the selected line,"[insert, comment, on, the, selected, line]","[insert, comment, selected, line]","[insert, comment, select, line]"


In [6]:
#nltk.download('wordnet')
lm=nltk.WordNetLemmatizer()
def lemmatizing(tokenised_list):
    text=[lm.lemmatize(word) for word in tokenised_list ]
    return text
df['intent_nostop_lem']=df['intent_nostop'].apply(lambda x:lemmatizing(x))
df.head()

Unnamed: 0,ID,INTENT,intent_tokenized,intent_nostop,intent_nostop_stem,intent_nostop_lem
0,5,Please clear the italics sentence,"[please, clear, the, italics, sentence]","[clear, italics, sentence]","[clear, ital, sentenc]","[clear, italic, sentence]"
1,23,Stop dictation near the last paragraph,"[stop, dictation, near, the, last, paragraph]","[stop, dictation, near, last, paragraph]","[stop, dictat, near, last, paragraph]","[stop, dictation, near, last, paragraph]"
2,16,left align the first word of selected paragraph,"[left, align, the, first, word, of, selected, ...","[left, align, first, word, selected, paragraph]","[left, align, first, word, select, paragraph]","[left, align, first, word, selected, paragraph]"
3,3,clear bold for last two sentences,"[clear, bold, for, last, two, sentences]","[clear, bold, last, two, sentences]","[clear, bold, last, two, sentenc]","[clear, bold, last, two, sentence]"
4,15,insert comment on the selected line,"[insert, comment, on, the, selected, line]","[insert, comment, selected, line]","[insert, comment, select, line]","[insert, comment, selected, line]"


In [7]:
stopwords_intent=[]
def clean_text_stem(text):
    tokens=re.split('\W+',text)
    [stopwords_intent.append(word) for word in tokens if word in stopword]
    text=[ps.stem(word) for word in tokens if word not in stopword]
    text=[word for word in text if word not in stopword]
    return text

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect_stem=CountVectorizer(analyzer=clean_text_stem)
X_counts_stem=count_vect_stem.fit_transform(df['INTENT'])
print(X_counts_stem.shape)
print(count_vect_stem.get_feature_names())

(441, 86)
['', 'Go', 'add', 'align', 'bold', 'break', 'bullet', 'capit', 'centr', 'chang', 'charact', 'clear', 'color', 'command', 'comment', 'content', 'delet', 'dictat', 'display', 'end', 'eras', 'first', 'five', 'format', 'four', 'full', 'go', 'halt', 'help', 'icon', 'imag', 'insert', 'ital', 'italic', 'italicis', 'kindli', 'last', 'left', 'let', 'letter', 'line', 'list', 'make', 'middl', 'move', 'near', 'new', 'next', 'open', 'page', 'paragraph', 'paus', 'pictur', 'place', 'pleas', 'posit', 'present', 'previou', 'put', 'red', 'remov', 'requir', 'reveal', 'right', 'select', 'sentenc', 'shift', 'show', 'start', 'stop', 'strike', 'strikethrough', 'subscript', 'superscript', 'symbol', 'tabl', 'take', 'text', 'textoutlin', 'three', 'two', 'unbold', 'underlin', 'undo', 'unitalic', 'word']


In [1]:
X_counts_stem_df=pd.DataFrame(X_counts_stem.toarray())
X_counts_stem_df.columns=count_vect_stem.get_feature_names()
X_counts_stem_df.head()


NameError: name 'pd' is not defined

In [10]:
X_features=X_counts_stem_df
X_features.head()

Unnamed: 0,Unnamed: 1,Go,add,align,bold,break,bullet,capit,centr,chang,...,take,text,textoutlin,three,two,unbold,underlin,undo,unitalic,word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Result 1

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_features, df['ID'], test_size=0.0, random_state=0)
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
clf.score(X_train, y_train)



0.9931972789115646

In [13]:
from sklearn.externals import joblib 
import os
# Save the model as a pickle in a file 
joblib.dump(clf, 'clf')
os.path.getsize('clf')

19157

In [14]:
##from sklearn.model_selection import KFold, cross_val_score
#k_fold=KFold(n_splits=10)
#cross_val_score(clf,X_features,df['ID'],cv=k_fold,scoring='accuracy',n_jobs=-1)

# Analysis

In [15]:
print(stopwords_intent[:5])

['the', 'the', 'the', 'of', 'for']


In [16]:
print(df["intent_nostop_stem"][:5])

0                           [clear, ital, sentenc]
1            [stop, dictat, near, last, paragraph]
2    [left, align, first, word, select, paragraph]
3                [clear, bold, last, two, sentenc]
4                  [insert, comment, select, line]
Name: intent_nostop_stem, dtype: object


In [17]:
all_token_intent=[]
all_token_intent=all_token_intent+[(lis) for lis in df["intent_nostop_stem"]]

In [18]:
print(all_token_intent[:5])

[['clear', 'ital', 'sentenc'], ['stop', 'dictat', 'near', 'last', 'paragraph'], ['left', 'align', 'first', 'word', 'select', 'paragraph'], ['clear', 'bold', 'last', 'two', 'sentenc'], ['insert', 'comment', 'select', 'line']]


In [19]:
flat_list = [item for sublist in all_token_intent for item in sublist]

In [20]:
freq={}
def CountFrequency(my_list): 
    for item in my_list: 
        if (item in freq): 
            freq[item] += 1
        else: 
            freq[item] = 1
CountFrequency(flat_list)
print(freq)

{'clear': 35, 'ital': 13, 'sentenc': 27, 'stop': 20, 'dictat': 36, 'near': 18, 'last': 108, 'paragraph': 39, 'left': 13, 'align': 43, 'first': 20, 'word': 73, 'select': 51, 'bold': 28, 'two': 40, 'insert': 25, 'comment': 15, 'line': 29, 'delet': 20, 'display': 7, 'list': 2, 'command': 21, 'posit': 3, 'imag': 6, 'middl': 5, 'page': 6, 'remov': 78, 'underlin': 41, 'format': 23, 'subscript': 32, 'text': 47, 'strikethrough': 32, '': 26, 'add': 5, 'open': 9, 'unbold': 6, 'help': 13, 'bullet': 38, 'undo': 18, 'chang': 5, 'start': 9, 'next': 12, 'superscript': 32, 'paus': 13, 'charact': 32, 'three': 7, 'centr': 17, 'make': 10, 'full': 1, 'italic': 7, 'previou': 1, 'go': 4, 'right': 19, 'tabl': 9, 'end': 10, 'content': 4, 'new': 2, 'eras': 3, 'unitalic': 4, 'place': 2, 'five': 1, 'show': 10, 'italicis': 4, 'move': 6, 'requir': 1, 'strike': 2, 'letter': 2, 'shift': 1, 'icon': 3, 'capit': 1, 'symbol': 1, 'take': 2, 'break': 2, 'halt': 3, 'present': 3, 'red': 1, 'color': 1, 'four': 1, 'reveal': 1

In [30]:
from sklearn.model_selection import cross_val_predict
y_test_pred = cross_val_predict(clf,X_train,y_train,cv=7)



In [31]:
y_test_pred

array([16, 22,  8,  7,  2, 15,  6,  1,  5, 20, 11, 20, 25, 18,  1, 26,  2,
       23, 18, 22, 17,  9, 24,  9, 11, 12,  4, 25,  9,  8, 13,  6, 23, 24,
       13,  7, 26,  7,  6, 26, 24,  1,  6,  2, 17, 13,  6,  1, 23, 23, 24,
       15, 25, 11, 17, 12,  3, 18,  7, 26, 14, 22, 11, 24,  9,  5, 21, 22,
       10,  6, 11, 17, 13, 12, 17, 14, 22,  2, 15, 14,  1,  1,  6, 22, 10,
       26,  4, 10,  4,  9,  2, 15,  2,  5, 15,  9, 12, 23,  2, 14, 16, 26,
        3, 22, 21, 24, 13, 24, 15,  9,  8, 26,  3,  6, 12,  9,  7, 14,  8,
        4, 18, 13, 16, 13, 18, 23, 23,  6, 25, 18,  9,  3, 21, 14, 18, 24,
       22, 24, 15, 19,  2, 25, 11, 17, 17, 13, 26,  6, 14,  6, 16,  7, 26,
       22,  7, 18, 24, 14, 20,  2,  5,  3, 15,  6,  3, 15, 10, 26, 18, 23,
       15, 11,  4, 18,  7,  1, 13, 17, 26, 13, 26, 24, 17, 11, 15, 18, 18,
        2, 19, 16,  7, 21, 26,  5,  4,  6, 26, 14, 26,  6, 14, 13, 18, 18,
       14, 21,  5, 15,  4, 14, 16, 10, 26,  5, 23,  8, 23, 21,  5, 18,  1,
       19, 17,  2, 11,  6

In [32]:
x=[]
for i in y_train.index:
    x.append(df.iloc[i]["ID"])

In [33]:
x

[16,
 22,
 8,
 7,
 2,
 15,
 6,
 1,
 5,
 20,
 11,
 20,
 25,
 18,
 1,
 26,
 2,
 23,
 18,
 22,
 17,
 9,
 24,
 9,
 11,
 12,
 4,
 25,
 9,
 8,
 13,
 6,
 23,
 24,
 13,
 7,
 26,
 7,
 6,
 26,
 24,
 1,
 6,
 2,
 17,
 13,
 6,
 1,
 23,
 23,
 24,
 15,
 25,
 11,
 17,
 12,
 3,
 18,
 7,
 26,
 14,
 22,
 11,
 24,
 9,
 5,
 21,
 22,
 10,
 6,
 11,
 17,
 13,
 12,
 17,
 14,
 22,
 2,
 15,
 14,
 1,
 1,
 6,
 22,
 10,
 26,
 4,
 10,
 4,
 9,
 2,
 15,
 2,
 5,
 15,
 9,
 12,
 23,
 2,
 14,
 16,
 26,
 3,
 22,
 21,
 24,
 13,
 24,
 15,
 9,
 8,
 26,
 3,
 6,
 12,
 9,
 7,
 14,
 8,
 4,
 18,
 13,
 16,
 13,
 18,
 23,
 23,
 7,
 25,
 18,
 9,
 3,
 21,
 14,
 18,
 24,
 22,
 24,
 15,
 19,
 2,
 25,
 11,
 17,
 17,
 13,
 26,
 7,
 14,
 6,
 16,
 7,
 26,
 22,
 7,
 18,
 24,
 14,
 20,
 2,
 4,
 3,
 15,
 6,
 3,
 15,
 10,
 26,
 18,
 23,
 15,
 11,
 4,
 18,
 7,
 1,
 13,
 17,
 26,
 13,
 26,
 24,
 17,
 11,
 15,
 18,
 18,
 2,
 19,
 16,
 7,
 21,
 26,
 5,
 4,
 6,
 26,
 14,
 26,
 6,
 14,
 13,
 18,
 18,
 14,
 19,
 5,
 15,
 4,
 14,
 16,
 10,
 26,
 5,
 23

1) confused between remove or apply
2) some remove sentences have undo like undo strikethrough it is taking simple undo under that category

In [34]:
from sklearn import metrics
labels=[]
for i in range(1,27):
    labels.append(i)
labels

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26]

In [35]:
print(metrics.confusion_matrix(x, y_test_pred, labels=labels))
print(metrics.classification_report(x, y_test_pred, labels=labels))

[[18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0 11  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0 24  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  2 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0 16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0  0 16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 11  2  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0

In [36]:
id_intent_map={1:"undo",2:"bold",3:"unbold",4:"italics",5:"unitalics",6:"underline",7:"un-underline",8:"superscript",9:"un-superscript",10:"subscript",11:"un-subscript",12:"strike",13:"unstrike",14:"center align",15:"comment",16:"left align",17:"right align",18:"un-format",19:"insert bullet",20:"next bullet",21:"end bullet",22:"pause dictation",23:"stop dictation",24:"commands",25:"help",26:"delete"}

In [37]:
input={'str':["exit bullets"]}
x_df = pd.DataFrame(input)
x_str=count_vect_stem.transform(x_df['str'])
x_str_stem_df=pd.DataFrame(x_str.toarray())
x_str_stem_df.columns=count_vect_stem.get_feature_names()
y_str=clf.predict(x_str_stem_df)
id_intent_map[y_str[0]]

'end bullet'

In [38]:
from sklearn_porter import Porter

In [39]:
porter = Porter(clf, language='java')

ValueError: Currently the given estimator 'LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)' isn't supported.