In [1]:
import pandas as pd

### Load Dataset

In [2]:
train = pd.read_csv("data/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
test = pd.read_csv("data/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
train["Phrase(origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [5]:
test["Phrase(origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
def clean_text(phrase):
    phrase = phrase.replace("doesn't ", "does not ")
    phrase = phrase.replace("ca n't ", "can not ")
    phrase = phrase.replace(" n't ", " not ")

    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [7]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Stem phrases

In [8]:
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_phrase(phrase):
    stemmed_words = [stemmer.stem(w) for w in phrase.split(" ")]
    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

tqdm.pandas(desc="Stemming...")
train["Phrase"].progress_apply(stem_phrase).head()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

Stemming...: 100%|██████████| 156060/156060 [00:12<00:00, 12389.16it/s]

(156060, 4)





Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [9]:
tqdm.pandas(desc="Stemming...")
test["Phrase"].progress_apply(stem_phrase).head()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

Stemming...: 100%|██████████| 66292/66292 [00:05<00:00, 13048.42it/s]

(66292, 3)





Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Vectorize phrases

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = TfidfVectorizer(analyzer='char', max_features=10000, ngram_range=(1, 9))
char_vectorizer

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
char_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
X_train_char = char_vectorizer.transform(train["Phrase"])
X_train_char

<156060x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 20095574 stored elements in Compressed Sparse Row format>

In [13]:
X_test_char = char_vectorizer.transform(test["Phrase"])
X_test_char

<66292x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 7897923 stored elements in Compressed Sparse Row format>

In [14]:
word_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))
word_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [15]:
word_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
X_train_word = word_vectorizer.transform(train["Phrase"])
X_train_word

<156060x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 1443766 stored elements in Compressed Sparse Row format>

In [17]:
X_test_word = word_vectorizer.transform(test["Phrase"])
X_test_word

<66292x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 469299 stored elements in Compressed Sparse Row format>

In [18]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])
X_train

<156060x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 21539340 stored elements in COOrdinate format>

In [19]:
X_test = hstack([X_test_char, X_test_word])
X_test

<66292x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 8367222 stored elements in COOrdinate format>

In [20]:
# columns = word_vectorizer.get_feature_names()
# pd.DataFrame(X_train.tocsr()[:100].toarray(), columns=columns).head()

In [21]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

In [22]:
sentence_ids = train["SentenceId"]

print(sentence_ids.shape)
sentence_ids.head()

(156060,)


PhraseId
1    1
2    1
3    1
4    1
5    1
Name: SentenceId, dtype: int64

## Score

In [23]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(alpha=0.000006762746, random_state=37)
model

SGDClassifier(alpha=6.762746e-06, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [24]:
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

kfold = GroupKFold(n_splits=5)

y_predict = cross_val_predict(model, X_train, y_train,
                              cv=kfold, groups=sentence_ids)

print(y_predict.shape)
y_predict[0:10]



(156060,)


array([1, 2, 2, 2, 2, 2, 2, 3, 2, 3])

In [25]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)
print("Score = {0:.5f}".format(score))

Score = 0.60069


In [26]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict
result["Difference(Phrase)"] = np.abs(y_train - y_predict)

print(result.shape)
result.head()

(156060, 6)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...,1,0
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...,2,0
3,1,A series,2,A series,2,0
4,1,A,2,A,2,0
5,1,series,2,series,2,0


In [27]:
sentiment = result.groupby("SentenceId")["Difference(Phrase)"].mean()
print(sentiment.shape)
sentiment.head()

(8529,)


SentenceId
1    0.253968
2    0.500000
3    0.142857
4    0.425000
5    0.500000
Name: Difference(Phrase), dtype: float64

In [28]:
def find_sentiment(sentence_id):
    return sentiment.loc[sentence_id]

result["Difference(Sentence)"] = result["SentenceId"].apply(find_sentiment)
result = result.sort_values(by="Difference(Sentence)", ascending=False)

print(result.shape)
result.head()

(156060, 7)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase),Difference(Sentence)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
79349,4087,I can not recommend it .,0,I ca n't recommend it .,4,4,4.0
79350,4087,can not recommend it .,0,ca n't recommend it .,4,4,4.0
113298,6020,real snooze .,0,real snooze .,3,3,3.0
113297,6020,A real snooze .,0,A real snooze .,3,3,3.0
12562,539,` This movie sucks . ',0,` This movie sucks . ',3,3,2.8


In [29]:
result[0:1000].to_csv("result.csv")

In [30]:
# vocabulary = vectorizer.get_feature_names()
# vocabulary[0:3]

In [31]:
# pd.DataFrame(vocabulary, columns=["word"]).to_csv("vocabulary.csv")

In [32]:
# result[result["Phrase"].str.contains("can not recommend")]

## Train

In [33]:
# !pip install xgboost

In [34]:
import xgboost as xgb

# params = {
#     'booster': 'gblinear',
#     'objective': 'multi:softmax',
#     'eval_metric': 'merror',
#     'lambda': 2.0,
#     'alpha': 1.0,
#     'lambda_bias': 6.0,
#     'num_class': 5,
#     'nthread': 8,
#     'n_jobs': -1,
#     'silent': 1,
# }

# params = {
#     'booster': 'gblinear',
#     'objective': 'multi:softmax',
#     'eval_metric': 'merror',
#     'lambda': 2.186753e-03,
#     'alpha': 1.286904,
#     'lambda_bias': 6.191707e+00,
#     'num_class': 5,
#     'nthread': 8,
#     'n_jobs': -1,
#     'silent': 1,
# }

model = xgb.XGBClassifier(n_estimators=45,
                          max_depth=6,
                          learning_rate=1.0,
                          max_delta_step=1,
                          nthread=-1,
                          seed=37)

#xgb.XGBClassifier(params)



In [36]:
dtrain = xgb.DMatrix(X_train, label=y_train)

#%time booster = xgb.train(params, dtrain, num_boost_round=98)
model.fit(X_train, y_train)

NameError: name 'params' is not defined

In [None]:
dtest = xgb.DMatrix(X_test.toarray())

predictions = booster.predict(dtest)

print(predictions.shape)
predictions[0:10]

## Submit

In [None]:
submission = pd.read_csv("submissions/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions.astype('int')

print(submission.shape)
submission.head()

In [None]:
from datetime import datetime

current_time = datetime.now()
current_time = current_time.strftime("%Y%m%d_%H%M%S")

submission.to_csv("submissions/xgboost_{0:.5f}_{time}.csv".format(score, time=current_time))