In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

In [6]:
# Load the training and test workbooks from the local data/ directory.
train = pd.read_excel("data/train.xlsx")
test = pd.read_excel("data/test.xlsx")

In [7]:
# Show the full text of each cell when previewing frames. The fully qualified option
# name with None is the documented way to disable column-width truncation.
pd.set_option("display.max_colwidth", None)

In [8]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,STORY,SECTION
0,"But the most painful was the huge reversal in fee income, unheard of among private sector lenders. Essentially, it means that Yes Bank took it for granted that fees on structured loan deals will be paid and accounted for upfront on its books. As borrowers turned defaulters, the fees tied to these loan deals fell off the cracks. Gill has now vowed to shift to a safer accounting practice of amortizing fee income rather than booking these upfront.\n\n\nGill’s move to mend past ways means that there will be no nasty surprises in the future. This is good news considering that investors love a clean image and loathe uncertainties.\n\n\nBut there is no gain without pain and the promise of a strong and stable balance sheet comes with some sacrifices as well. Investors will have to give up the hopes of phenomenal growth, a promise made by Kapoor.",3
1,"How formidable is the opposition alliance among Congress, Jharkhand Mukti Morcha (JMM) and Jharkhand Vikas Morcha (Prajatantrik)?",0
2,"Most Asian currencies were trading lower today. South Korean won was down 0.4%, China renminbi 0.23%, China Offshore 0.15%, Malaysian ringgit 0.12%, Indonesian rupiah 0.11%, Taiwan dollar 0.06%. However, Japanese yen was up 0.32%.\n\n\nThe dollar index, which measures the US currency’s strength against major currencies, was trading at 97.26, down 0.14% from its previous close of 97.395.",3
3,"If you want to answer any question, click on ‘Answer’. After clicking on Answer you can also check out replies of other users. Proceed to Answer either through writing or voice command.\n\n\nIf you want to ask any question, click on “Ask A Question"". Few question prefixes are already inserted to help you with your questions. After submitting your question, the app will send the questions to your neighbours and will let you know how many neighbours\n\n\nhave been asked the question. Click Done. You can click on the bell icon on the homepage of the app to follow answers to your questions.",1
4,"In global markets, gold prices edged up today as disappointing Chinese factory activity data brought back concerns about the health of the global economy, denting risk appetite. Spot gold rose 0.4% to $1,285 per ounce. European equity markets nudged down today, following weaker Asian stock markets.",3


In [9]:
def remove_pattern(pattern,text):
    """Replace every match of ``pattern`` in ``text`` with a single space.

    Parameters
    ----------
    pattern : str or compiled regex
        Regular expression describing the characters/substrings to blank out.
    text : str
        Input string to clean.

    Returns
    -------
    str
        ``text`` with each match substituted by ``" "``.
    """
    # The substituted string must be returned; assigning it to the local
    # name alone would make the function always yield None.
    return re.sub(pattern, " ", text)

In [10]:
# Keep ASCII letters only: every other character (digits, punctuation, newlines)
# is replaced by a space.
pattern = "[^a-zA-Z]"
# Pass the callable itself. Wrapping it in a list makes Series.apply treat it as a
# multi-function aggregation and hand back a DataFrame instead of a Series.
train["STORY"] = train["STORY"].apply(lambda x: re.sub(pattern, " ", x))

In [11]:
# Normalise case, then break each story into whitespace-separated tokens.
train["STORY"] = train["STORY"].apply(lambda story: story.lower().split())

In [12]:
# A set gives constant-time membership checks for the per-token stop-word filter;
# a plain list would be scanned linearly for every token.
stop = set(stopwords.words('english'))

In [13]:
# Drop every token that appears in the English stop-word collection.
train["STORY"] = train["STORY"].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop]
)

In [14]:
# Instantiate both normalisers so stemmed and lemmatised variants can be built side by side.
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()

In [15]:
# Separate copies of the tokenised frame: one receives stems, the other lemmas.
train_stem = train.copy()
train_lemma = train.copy()

In [16]:
# Reduce each token to its Porter stem / WordNet lemma, keeping one token list per story.
train_stem["STORY"] = train_stem["STORY"].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
train_lemma["STORY"] = train_lemma["STORY"].apply(
    lambda tokens: list(map(lemma.lemmatize, tokens))
)

In [76]:
# Preview the stemmed training frame.
train_stem.head()

Unnamed: 0,STORY,SECTION
0,pain huge revers fee incom unheard among privat sector lender essenti mean ye bank took grant fee structur loan deal paid account upfront book borrow turn default fee tie loan deal fell crack gill vow shift safer account practic amort fee incom rather book upfront gill move mend past way mean nasti surpris futur good news consid investor love clean imag loath uncertainti gain without pain promis strong stabl balanc sheet come sacrific well investor give hope phenomen growth promis made kapoor,3
1,formid opposit allianc among congress jharkhand mukti morcha jmm jharkhand vika morcha prajatantrik,0
2,asian currenc trade lower today south korean china renminbi china offshor malaysian ringgit indonesian rupiah taiwan dollar howev japanes yen dollar index measur us currenc strength major currenc trade previou close,3
3,want answer question click answer click answer also check repli user proceed answer either write voic command want ask question click ask question question prefix alreadi insert help question submit question app send question neighbour let know mani neighbour ask question click done click bell icon homepag app follow answer question,1
4,global market gold price edg today disappoint chines factori activ data brought back concern health global economi dent risk appetit spot gold rose per ounc european equiti market nudg today follow weaker asian stock market,3


In [17]:
# Glue the token lists back into space-separated strings, the input form the vectoriser expects.
train_stem["STORY"] = train_stem["STORY"].apply(' '.join)
train_lemma["STORY"] = train_lemma["STORY"].apply(' '.join)

In [13]:
# Preview the lemmatised training frame.
# NOTE(review): the saved output still shows capitalised words and stop words
# ("But", "As", "This"), so it appears to predate the lower-casing and stop-word
# cells; re-run the notebook top to bottom to refresh it.
train_lemma.head()

Unnamed: 0,STORY,SECTION
0,But painful huge reversal fee income unheard among private sector lender Essentially mean Yes Bank took granted fee structured loan deal paid accounted upfront book As borrower turned defaulter fee tied loan deal fell crack Gill vowed shift safer accounting practice amortizing fee income rather booking upfront Gill move mend past way mean nasty surprise future This good news considering investor love clean image loathe uncertainty But gain without pain promise strong stable balance sheet come sacrifice well Investors give hope phenomenal growth promise made Kapoor,3
1,How formidable opposition alliance among Congress Jharkhand Mukti Morcha JMM Jharkhand Vikas Morcha Prajatantrik,0
2,Most Asian currency trading lower today South Korean China renminbi China Offshore Malaysian ringgit Indonesian rupiah Taiwan dollar However Japanese yen The dollar index measure US currency strength major currency trading previous close,3
3,If want answer question click Answer After clicking Answer also check reply user Proceed Answer either writing voice command If want ask question click Ask A Question Few question prefix already inserted help question After submitting question app send question neighbour let know many neighbour asked question Click Done You click bell icon homepage app follow answer question,1
4,In global market gold price edged today disappointing Chinese factory activity data brought back concern health global economy denting risk appetite Spot gold rose per ounce European equity market nudged today following weaker Asian stock market,3


In [18]:
# Learn the TF-IDF vocabulary and weights from the stemmed training stories.
tfidf = TfidfVectorizer()
vector = tfidf.fit_transform(train_stem["STORY"])
# Keep the matrix sparse: train_test_split and the estimators fitted in this notebook
# accept SciPy sparse input, whereas a dense documents x vocabulary copy is almost
# entirely zeros and costs far more memory.
X_stem = vector
# To experiment with lemmatised features, fit on train_lemma["STORY"] the same way.

In [14]:
# Target labels: the SECTION column of the training frame.
y = train["SECTION"]

In [15]:
# Hold out 30% of the rows for validation; random_state=0 makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X_stem,y,test_size=0.3,random_state=0)

In [38]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
log = LogisticRegression()

In [39]:
# Fit on the training portion of the hold-out split.
log.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
# Predict labels for the held-out rows.
y_pred = log.predict(X_test)

In [41]:
# Hold-out accuracy of the logistic-regression predictions; the bare name displays the value.
acc = accuracy_score(y_test,y_pred)
acc

0.9694189602446484

In [54]:
# Multinomial naive Bayes with default settings, fitted and evaluated on the same split.
nb = MultinomialNB()
nb.fit(X_train,y_train)
# y_pred is reused for every model, so it now holds the naive Bayes predictions.
y_pred = nb.predict(X_test)

In [55]:
# Hold-out accuracy of whichever model last wrote y_pred (naive Bayes in top-to-bottom order).
acc = accuracy_score(y_test,y_pred)
acc

0.9305373525557011

In [42]:
# Linear SVM; verbose=1 makes the solver emit its progress marker, random_state=0 fixes the seed.
linear_svc = LinearSVC(verbose=1,random_state=0)
linear_svc.fit(X_train,y_train)

[LibLinear]

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=1)

In [43]:
# Predict the held-out rows with the linear SVM (overwrites the previous y_pred).
y_pred = linear_svc.predict(X_test)

In [44]:
# Hold-out accuracy of whichever model last wrote y_pred (the linear SVM in top-to-bottom order).
acc = accuracy_score(y_test,y_pred)
acc

0.9781564001747488

In [57]:
# Macro-averaged precision, recall and F-score. y_pred is shared by all models,
# so this scores whichever model predicted most recently.
precision_recall_fscore_support(y_test,y_pred,average='macro')

(0.9800958993452222, 0.9754468509526514, 0.9776578025475773, None)

In [64]:
# Random forest with default hyper-parameters; random_state=0 makes the fit repeatable.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [65]:
# Predict the held-out rows with the random forest (overwrites the previous y_pred).
y_pred = rf.predict(X_test)

In [66]:
# Hold-out accuracy of whichever model last wrote y_pred (the random forest in top-to-bottom order).
acc = accuracy_score(y_test,y_pred)
acc

0.9484491044124072

## Test set: preprocessing, refit on the full training data, and predictions

In [16]:
# Same cleaning as the training stories: anything that is not an ASCII letter becomes a space.
pattern = "[^a-zA-Z]"
# Pass the callable itself. Wrapping it in a list makes Series.apply treat it as a
# multi-function aggregation and hand back a DataFrame instead of a Series.
test["STORY"] = test["STORY"].apply(lambda x: re.sub(pattern, " ", x))

In [17]:
# Normalise case, then break each test story into whitespace-separated tokens.
test["STORY"] = test["STORY"].apply(lambda story: story.lower().split())

In [18]:
# A set gives constant-time membership checks for the per-token stop-word filter;
# a plain list would be scanned linearly for every token.
stop = set(stopwords.words('english'))

In [19]:
# Drop every token that appears in the English stop-word collection.
test["STORY"] = test["STORY"].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop]
)

In [20]:
# Separate copies of the tokenised test frame: one receives stems, the other lemmas.
test_stem = test.copy()
test_lemma = test.copy()

In [21]:
# Apply the same stemmer / lemmatiser instances used for the training stories.
test_stem["STORY"] = test_stem["STORY"].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
test_lemma["STORY"] = test_lemma["STORY"].apply(
    lambda tokens: list(map(lemma.lemmatize, tokens))
)

In [22]:
# Glue the token lists back into space-separated strings for the vectoriser.
test_stem["STORY"] = test_stem["STORY"].apply(' '.join)
test_lemma["STORY"] = test_lemma["STORY"].apply(' '.join)

In [23]:
# Vectorise the test stories with the already-fitted vectoriser (transform only, no
# refit), so the columns line up with the training matrix.
vector = tfidf.transform(test_stem["STORY"])
# NOTE: despite the "y" prefix this is the test *feature* matrix; the name is kept
# because later cells reference it. It stays sparse because the estimators' predict()
# accepts SciPy sparse input, so a dense copy only costs memory.
y_stem = vector

In [33]:
# Shape of the test feature matrix; its column count should equal that of X_stem.
y_stem.shape

(2748, 21659)

In [34]:
# Shape of the training feature matrix, for comparison with y_stem.shape.
X_stem.shape

(7628, 21659)

In [52]:
# Fresh logistic-regression instance for the final fit on all training rows.
log = LogisticRegression()

In [36]:
# Refit on the full training matrix (no hold-out this time).
log.fit(X_stem,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
# y_stem holds the vectorised test stories (features, despite the "y" prefix).
# NOTE(review): these predictions are overwritten by the LinearSVC predictions
# further down before the submission frame is built.
y_pred = log.predict(y_stem)

In [27]:
# Final model: linear SVM refitted on all training rows with a fixed seed.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(X_stem,y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)

In [28]:
# Predict the test stories with the linear SVM (replaces the logistic-regression y_pred).
y_pred = linear_svc.predict(y_stem)

In [29]:
# Wrap the current y_pred in a single-column frame named SECTION.
submission = pd.DataFrame(y_pred,columns=["SECTION"])

In [30]:
# Display the submission frame.
submission

Unnamed: 0,SECTION
0,1
1,2
2,1
3,1
4,1
...,...
2743,1
2744,1
2745,1
2746,0


In [31]:
# Write the predictions to disk.
# NOTE(review): without index=False the row index is written as an unnamed first
# column, unlike the treated_* exports in this notebook, which pass index=False.
# Confirm which layout the submission format expects.
submission.to_csv("submission.csv")

In [32]:
# Save the stemmed train and test frames to CSV without the index column.
train_stem.to_csv("treated_train.csv",index=False)
test_stem.to_csv("treated_test.csv",index=False)