In [99]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [100]:
# ls

In [101]:
# Loading the dataset
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/train.csv")

In [102]:
# Example of a non disaster tweet
train_df[train_df["target"] == 0]["text"].values[1]

'I love fruits'

In [103]:
# Example of disaster tweet
train_df[train_df["target"] == 1]["text"].values[1]

'Forest fire near La Ronge Sask. Canada'

In [104]:
# Starting with the assumption that words in the tweets will be an indicator of whether they're about
# a real disaster or not, we vectorize the tweets. 
count_vectorizer = feature_extraction.text.CountVectorizer()
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [105]:
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

"""
The above tells us that:

There are 54 unique words (or "tokens") in the first five tweets.
The first tweet contains only some of those unique tokens 
- all of the non-zero counts above are the tokens that DO exist in the first tweet.
"""

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


'\nThe above tells us that:\n\nThere are 54 unique words (or "tokens") in the first five tweets.\nThe first tweet contains only some of those unique tokens \n- all of the non-zero counts above are the tokens that DO exist in the first tweet.\n'

In [106]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [107]:
clf = linear_model.RidgeClassifier()

In [108]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores
# TFIDF, LSA, LSTM / RNNs for optimization 

array([0.60387232, 0.57580105, 0.64516129])

In [109]:
# Preparing sample submission for Kaggle 
clf.fit(train_vectors, train_df["target"])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)

In [113]:
sample_submission = pd.read_csv("data/sample_submission.csv")

In [114]:
len(sample_submission["target"])

3263

In [115]:
len(clf.predict(test_vectors))

7613

In [117]:
sample_submission["target"] = clf.predict(test_vectors)

ValueError: Length of values does not match length of index

In [57]:
sample_submission.head()


Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [None]:
#sample_submission.to_csv("submission.csv", index=False)