In [1]:
import sys, os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
# Load the train and test splits; the first CSV column becomes the index and
# the files are memory-mapped while being parsed.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0, memory_map=True)
    for csv_path in ('./challenge_train_.csv', './challenge_test_.csv')
)

FileNotFoundError: [Errno 2] File b'./challenge_train_.csv' does not exist: b'./challenge_train_.csv'

# Exploratory data analysis

In [None]:
# Summary of the train frame: columns, dtypes, non-null counts, memory usage.
df_train.info()

In [None]:
# Summary of the test frame: columns, dtypes, non-null counts, memory usage.
df_test.info()

## Drop duplicates

### in _train_ set

In [None]:
# Report how many rows are exact repeats of an earlier row, then keep only the
# first occurrence of each.
print('duplicated rows: ', df_train.duplicated().sum())
df_train = df_train.drop_duplicates()

### in _test_ set

In [None]:
# Report how many rows are exact repeats of an earlier row, then keep only the
# first occurrence of each.
print('duplicated rows: ', df_test.duplicated().sum())
df_test = df_test.drop_duplicates()

### between  _train_ and _test_ set (I keep rows in _train_ set)

In [None]:
# Inner-join train with the test texts: the result has one row for every
# (train row, test row) pair that shares the same review text.
df_merge = df_train.merge(df_test.loc[:, 'text'], on='text')
print('text duplicated: {}'.format(df_merge.shape[0]))

# Remove every shared text from both frames. The explicit .copy() makes each
# result an independent frame, so the column assignments performed later in
# the notebook (clean_text, sentiment) do not raise SettingWithCopyWarning.
# NOTE(review): the section heading says the rows are kept in the train set,
# but the first line below removes them from train as well — confirm which
# behaviour is intended.
df_train = df_train[~df_train.text.isin(df_merge.text)].copy()
df_test = df_test[~df_test.text.isin(df_merge.text)].copy()

## _sentiment_ proportions in _train_ set

In [None]:
# Pie chart of how many train reviews carry each sentiment label.
df_train['sentiment'].value_counts().plot.pie();

## _text_ length in _train_ set

In [None]:
# Summary statistics of the review length, measured in characters.
df_train['text'].map(len).describe()

In [None]:
# Histogram (50 bins) of review length in characters over the train set.
df_train.text.apply(len).hist(bins=50);
# Axis labels and title with the spelling mistakes fixed
# ("characteres" -> "characters", "lenght" -> "length").
plt.xlabel('quantity of characters')
plt.ylabel('quantity of reviews')
plt.title('Quantity of reviews by text length');

# ML model

## Data cleaning

In [None]:
import string

def clean_text(text):
    """Normalize a raw review string for bag-of-words vectorization.

    The text is lowercased and split into whitespace-delimited tokens; each
    token has leading and trailing ASCII punctuation stripped; tokens that
    contain a digit, or that are empty after stripping, are discarded.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines). Splitting on a single space left words glued together across
    # tabs/newlines, so a word adjacent to a number was discarded along with
    # it, and repeated spaces produced empty tokens.
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    kept = [
        word for word in stripped
        if word and not any(char.isdigit() for char in word)
    ]
    return " ".join(kept)

In [None]:
# Add a column holding the normalized version of every train review.
df_train['clean_text'] = df_train['text'].map(clean_text)

In [None]:
# Features are the cleaned review texts; labels are the sentiment column.
X = df_train.clean_text
y = df_train.sentiment

# Token counts -> TF-IDF weighting -> linear support-vector classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC())
])

# Search grid; currently a single point (unigrams + bigrams).
# Other settings that were considered:
#   'tfidf__use_idf': (True, False), 'clf__tol': [1e-4]
parameters = {
    'vect__ngram_range': [(1, 2)],
}

# `iid` is deliberately not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
# From 0.22 on, the default already averages the fold scores without
# weighting by fold size, which is what iid=False requested.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1, scoring='accuracy')

# 5-fold cross-validated search, then a refit of the best pipeline on all of X.
gs_clf = gs_clf.fit(X, y)

# Estimated score

In [None]:
# Mean cross-validated accuracy of the best parameter combination found by the
# grid search (scoring='accuracy', cv=5).
print(gs_clf.best_score_)

# Predictions 

In [None]:
# Add a column holding the normalized version of every test review.
df_test['clean_text'] = df_test['text'].map(clean_text)

In [None]:
# Predict a sentiment label for every cleaned test review with the fitted
# grid-search estimator.
X_test = df_test['clean_text']
y_pred = gs_clf.predict(X_test)

In [None]:
# Swap the existing sentiment column for the predicted labels; the new column
# is appended as the last column of the frame.
df_test = df_test.drop(columns=['sentiment']).assign(sentiment=y_pred)

In [None]:
# Write the original text and the predicted sentiment, ordered by index.
(
    df_test
    .loc[:, ['text', 'sentiment']]
    .sort_index()
    .to_csv('./test.csv')
)