In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit

%matplotlib inline

In [2]:
# Read the raw train / test splits from disk.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Set the target column aside and collect its distinct values.
labels = train["sentiment"]
classes = set(labels)

# Remove the identifier column (and, for train, the target) from the frames.
train = train.drop(["id", "sentiment"], axis="columns")
test = test.drop(["id"], axis="columns")
train.head()

Unnamed: 0,author,content
0,richardepryor,"@treasaint salad stuff, some chillis, whatever..."
1,reese,"@sunnyjamiel sunny, I'm a workin' on it. It's ..."
2,mutedriposte,@jolynnchew so early??
3,sakizzie_1102,"So now, I have conjunctivitis in my left eye. ..."
4,poptrash,"Out and about in Deal, Kent. More sunshine req..."


In [3]:
import heapq
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [5]:
# Baseline text-classification pipeline: bag-of-words counts fed into a
# logistic regression.
pipeline = Pipeline([
    # min_df=5 drops terms that occur in fewer than 5 documents;
    # ngram_range=(1, 2) extracts unigrams and bigrams.
    ('vectorizer', CountVectorizer(min_df=5, ngram_range=(1, 2))),
    # The solver is pinned explicitly: the grid search run on this pipeline
    # tries penalty='l1', which 'liblinear' supports. Relying on the library
    # default is fragile because newer scikit-learn releases default to
    # 'lbfgs', which rejects the L1 penalty.
    ('algo', LogisticRegression(solver='liblinear'))
])

In [9]:
# Preview 17 evenly spaced points on [0, 1], i.e. a step of 1/16.
np.linspace(0, 1, num=17)

array([0.    , 0.0625, 0.125 , 0.1875, 0.25  , 0.3125, 0.375 , 0.4375,
       0.5   , 0.5625, 0.625 , 0.6875, 0.75  , 0.8125, 0.875 , 0.9375,
       1.    ])

In [13]:
# Tune the hyper-parameters with GridSearch.
# The C candidates are the multiples of 1/16 in (0, 1]; each k / 16 is exactly
# representable in binary floating point, so the generated tuple holds the same
# floats as the written-out literals 0.0625, 0.125, ..., 1.0.
parameters = dict(
    vectorizer__ngram_range=((1, 1), (1, 2)),
    algo__penalty=('l1', 'l2'),
    algo__C=tuple(k / 16 for k in range(1, 17)),
)

In [14]:
# Exhaustive search over `parameters` for `pipeline`, using every available
# core and printing progress messages.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

In [15]:
%time grid_search.fit(train.content, labels)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  6.7min finished


Wall time: 6min 50s


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vectorizer__ngram_range': ((1, 1), (1, 2)), 'algo__penalty': ('l1', 'l2'), 'algo__C': (0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375, 0.5, 0.5625, 0.625, 0.6875, 0.75, 0.8125, 0.875, 0.9375, 1.0)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [16]:
# Show the full parameter set of the refitted best pipeline, including the
# nested parameters of each step.
best_pipeline = grid_search.best_estimator_
best_pipeline.get_params(deep=True)

{'algo': LogisticRegression(C=0.4375, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 'algo__C': 0.4375,
 'algo__class_weight': None,
 'algo__dual': False,
 'algo__fit_intercept': True,
 'algo__intercept_scaling': 1,
 'algo__max_iter': 100,
 'algo__multi_class': 'ovr',
 'algo__n_jobs': 1,
 'algo__penalty': 'l1',
 'algo__random_state': None,
 'algo__solver': 'liblinear',
 'algo__tol': 0.0001,
 'algo__verbose': 0,
 'algo__warm_start': False,
 'memory': None,
 'steps': [('vectorizer',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=5,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, tok

In [17]:
# Build a new pipeline using the parameters selected by the grid search:
# unigrams only, C=0.4375 and an L1 penalty.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(min_df=5, ngram_range=(1, 1))),
    # The solver is pinned explicitly because penalty='l1' needs a solver that
    # supports it. 'liblinear' does, whereas the 'lbfgs' default of newer
    # scikit-learn releases raises an error for L1.
    ('algo', LogisticRegression(C=0.4375, penalty='l1', solver='liblinear'))
])

In [18]:
# Train the tuned pipeline on the full training text.
pipeline.fit(train["content"], labels)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...ty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [19]:
# Accuracy on the same data the pipeline was fitted on, so this is an
# optimistic estimate rather than a held-out score.
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order.
accuracy_score(labels, pipeline.predict(train.content))

0.4210333333333333

In [20]:
# Predict a sentiment label for every test tweet.
preds = pipeline.predict(test["content"])

In [21]:
# Fill the sample submission template with the predictions and write it out.
submission = pd.read_csv("sampleSubmission.csv")
# Item assignment always writes a DataFrame column. Attribute assignment
# (`submission.sentiment = ...`) would only set a plain Python attribute, not a
# column, if the template had no "sentiment" column.
submission["sentiment"] = preds
submission.to_csv("submission_10.csv", index=False)