In [37]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV

In [2]:
train = pd.read_csv("treated_train.csv")
test = pd.read_csv("treated_test.csv")

In [3]:
train.head()

Unnamed: 0,STORY,SECTION
0,pain huge revers fee incom unheard among priva...,3
1,formid opposit allianc among congress jharkhan...,0
2,asian currenc trade lower today south korean c...,3
3,want answer question click answer click answer...,1
4,global market gold price edg today disappoint ...,3


In [4]:
tfidf = TfidfVectorizer()
vector = tfidf.fit_transform(train["STORY"])
X = vector.toarray()

In [5]:
y = train['SECTION']

In [6]:
vector = tfidf.transform(test["STORY"])
y_test = vector.toarray()

In [7]:
def scores(model,X,y):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_pred,y_test)
    print("Accuracy is ",acc)

In [11]:
def fit_predict(model,X,y,test):
    model.fit(X,y)
    y_pred = model.predict(y_test)
    return model,y_pred

In [12]:
log = LogisticRegression(random_state=0)

In [10]:
scores(log,X,y)

Accuracy is  0.9694189602446484


In [13]:
log,y_pred = fit_predict(log,X,y,y_test)

In [14]:
nb = MultinomialNB()

In [15]:
scores(nb,X,y)

Accuracy is  0.9305373525557011


In [16]:
fit_predict(nb,X,y,y_test)

(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 array([1, 2, 1, ..., 1, 0, 1], dtype=int64))

In [32]:
np.mean(cross_val_score(nb,X,y,scoring='accuracy',cv=10))

0.9442833407291978

In [22]:
linear_svc = LinearSVC(random_state=0)

In [23]:
scores(linear_svc,X,y)

Accuracy is  0.9781564001747488


In [19]:
fit_predict(linear_svc,X,y,y_test)

(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
           verbose=0), array([1, 2, 1, ..., 1, 0, 1], dtype=int64))

In [31]:
np.mean(cross_val_score(linear_svc,X,y,scoring='accuracy',cv=10))

0.9781070370790808

In [43]:
rf = RandomForestClassifier(random_state=0)

In [44]:
scores(rf,X,y)

Accuracy is  0.9484491044124072


In [45]:
np.mean(cross_val_score(rf,X,y,scoring='accuracy',cv=10))

0.9537242477717808

In [38]:
parameters = {'penalty':['l1','l2'],
             'random_state':[0,42,100]}

In [39]:
random = RandomizedSearchCV(estimator=linear_svc,param_distributions=parameters,n_jobs=-1,verbose=1)

In [40]:
random.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.8min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                       fit_intercept=True, intercept_scaling=1,
                                       loss='squared_hinge', max_iter=1000,
                                       multi_class='ovr', penalty='l2',
                                       random_state=0, tol=0.0001, verbose=0),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'penalty': ['l1', 'l2'],
                                        'random_state': [0, 42, 100]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=1)

In [41]:
random.best_params_

{'random_state': 0, 'penalty': 'l2'}

In [29]:
submission = pd.DataFrame(y_pred,columns=["SECTION"])

In [30]:
submission

Unnamed: 0,SECTION
0,1
1,2
2,1
3,1
4,1
...,...
2743,1
2744,1
2745,1
2746,0


In [31]:
submission.to_csv("submission.csv")