In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
pd.set_option('max_colwidth',400)

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
train = pd.read_csv('../input/av-hacks/train.csv')
test = pd.read_csv('../input/av-hacks/test.csv')

In [4]:
print('Average count of phrases per sentence in train is {0:.0f}.'.format(train.groupby('drug')['text'].count().mean()))
print('Average count of phrases per sentence in test is {0:.0f}.'.format(test.groupby('drug')['text'].count().mean()))

Average count of phrases per sentence in train is 52.
Average count of phrases per sentence in test is 31.


In [5]:
print('Average word length of text in train is {0:.0f}.'.format(np.mean(train['text'].apply(lambda x: len(x.split())))))
print('Average word length of text phrases in test is {0:.0f}.'.format(np.mean(test['text'].apply(lambda x: len(x.split())))))

Average word length of text in train is 397.
Average word length of text phrases in test is 462.


In [6]:
tokenizer = TweetTokenizer()

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(train['text'].values) + list(test['text'].values)
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(train['text'])
test_vectorized = vectorizer.transform(test['text'])

In [8]:
y = train['sentiment']

In [9]:
from sklearn.svm import SVC

In [10]:
logreg = LinearSVC()
ovr = OneVsRestClassifier(logreg)

In [11]:
%%time
ovr.fit(train_vectorized, y)

CPU times: user 2.08 s, sys: 12 ms, total: 2.09 s
Wall time: 2.09 s


OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.0001,
                                        verbose=0),
                    n_jobs=None)

In [12]:
scores = cross_val_score(ovr, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 73.37%, std 0.21.


In [13]:
pred = ovr.predict(test_vectorized)

In [14]:
d = {'unique_hash': test['unique_hash'], 'sentiment': pred}
df = pd.DataFrame(data=d)

In [15]:
df['sentiment'].value_counts()

2    2814
1      68
0      42
Name: sentiment, dtype: int64

In [16]:
df.to_csv("sub.csv",index=False)