In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals import joblib
import pickle

In [2]:
# Load the raw dataset from the working directory; a backslash is the escape
# character inside quoted fields.
data = pd.read_csv(
    'data.csv',
    escapechar="\\",
)

In [3]:
# Keep only the two columns the model uses: the post title (feature text) and
# its upvote count (regression target).
model_columns = ['title', 'ups']
data_sub = data[model_columns]

In [4]:
# Drop outliers: keep rows whose upvote count lies within two standard
# deviations of the mean. The absolute value has to wrap the deviation itself;
# wrapping the boolean comparison (as before) only trimmed the upper tail.
# NOTE: this reassigns data_sub, so re-running the cell filters again using the
# statistics of the already-filtered frame.
ups_deviation = np.abs(data_sub.ups - data_sub.ups.mean())
data_sub = data_sub[ups_deviation <= 2 * data_sub.ups.std()]

# Random ~80/20 train/test split. A dedicated, seeded generator keeps the split
# (and every artifact fitted on it) reproducible across kernel restarts without
# touching NumPy's global random state.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(data_sub)) < 0.8
train_set = data_sub[msk]
test_set = data_sub[~msk]

# Separate the text feature (title) from the regression target (ups).
test_titles = test_set['title']
test_labels = test_set['ups']
train_titles = train_set['title']
train_labels = train_set['ups']

In [5]:
# Learn the bag-of-words vocabulary from the training titles only, so the test
# titles are encoded with a vocabulary they did not influence.
count_vectorizer = CountVectorizer()
bow_transformer = count_vectorizer.fit(train_titles)

# Persist the fitted vectorizer for apply(). The context manager guarantees the
# file is flushed and closed; protocol=2 is kept from the original artifact.
with open('bow_transformer_v1.pkl', 'wb') as bow_file:
    pickle.dump(bow_transformer, bow_file, protocol=2)

# Encode both splits as token-count matrices.
train_titles_bow = bow_transformer.transform(train_titles)
test_titles_bow = bow_transformer.transform(test_titles)

In [6]:
# Fit the TF-IDF weighting on the training count matrix only.
tfidf_transformer = TfidfTransformer().fit(train_titles_bow)

# Persist the fitted transformer for apply(); `with` closes the file handle
# that the bare open() call used to leak.
with open('tfidf_transformer_v1.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf_transformer, tfidf_file, protocol=2)

# Re-weight the count matrices of both splits.
train_titles_tfidf = tfidf_transformer.transform(train_titles_bow)
test_titles_tfidf = tfidf_transformer.transform(test_titles_bow)

In [7]:
# LSA: reduce the TF-IDF matrix with a truncated SVD (library defaults) and
# normalise each reduced row in place (copy=False).
svd = TruncatedSVD()
lsa = make_pipeline(svd, Normalizer(copy=False))
X_train_lsa = lsa.fit_transform(train_titles_tfidf)

# Persist the fitted pipeline for apply(); `with` closes the file handle that
# the bare open() call used to leak.
with open('lsa_v1.pkl', 'wb') as lsa_file:
    pickle.dump(lsa, lsa_file, protocol=2)

# Project the held-out titles with the pipeline fitted on the training data.
X_test_lsa = lsa.transform(test_titles_tfidf)


In [8]:
# Fit a ridge regressor (library defaults) on the LSA features; fit() returns
# the estimator itself, so construction and fitting are chained.
reg = Ridge().fit(X_train_lsa, train_labels)

# Predictions for both splits, kept for later inspection.
test_predictions = reg.predict(X_test_lsa)
train_predictions = reg.predict(X_train_lsa)

# Held-out score as reported by the estimator's own score() method
# (R^2 for scikit-learn regressors).
score = reg.score(X_test_lsa, test_labels)

In [9]:
# Persist the fitted regressor for apply(). The context manager closes the file
# handle that the bare open() call used to leak.
with open('prediction_model_v1.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file, protocol=2)

In [10]:
# Cross-validate the ridge model on the training features. With this scorer the
# returned values are negated mean absolute errors (closer to zero is better),
# which is why the recorded output is negative.
cv_mae_scores = cross_val_score(
    reg,
    X_train_lsa,
    train_labels,
    scoring='mean_absolute_error',
)
cv_mae_scores

array([-422.0925304 , -418.55767351, -426.90402787])

In [11]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts produced by this notebook / a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def apply(input):
    """Predict the target value for a single title using the persisted pipeline.

    The title is passed through the pickled stages in the order they were
    fitted: bag-of-words -> TF-IDF -> LSA, and the result is fed to the pickled
    prediction model. All artifacts are read from the current working directory
    on every call.

    Parameters
    ----------
    input : str
        The title text to score. (The name shadows the ``input`` builtin; it is
        kept unchanged for backward compatibility with keyword callers.)

    Returns
    -------
    list
        ``model.predict(...).tolist()`` for the one-element batch, i.e. a list
        holding the prediction for ``input``.

    Raises
    ------
    OSError
        If any of the four artifact files cannot be opened.
    """
    transformer_paths = (
        "bow_transformer_v1.pkl",
        "tfidf_transformer_v1.pkl",
        "lsa_v1.pkl",
    )

    # The first stage expects an iterable of documents, hence the 1-item list.
    features = [input]
    for path in transformer_paths:
        # Each stage is loaded right before use, matching the original order.
        features = _load_pickle(path).transform(features)

    model = _load_pickle("prediction_model_v1.pkl")
    return model.predict(features).tolist()

In [12]:
# Smoke-test the persisted pipeline end to end on one sample headline.
sample_headline = "Donald Trump defends paper towels in Puerto Rico, says Stephen Paddock was ‘probably smart’ in bizarre TV interview: Analysis"
apply(sample_headline)

[262.11170429776087]

In [13]:
# Show the first training example: its target and its LSA feature vector.
# train_labels keeps the original DataFrame index, which has gaps after the
# outlier filter and the random split, so `train_labels[0]` is a *label* lookup
# that raises KeyError whenever original row 0 is not in the training set.
# .iloc[0] selects by position and therefore always pairs with X_train_lsa[0].
print(train_labels.iloc[0])
print(X_train_lsa[0])

0
[ 0.90090188  0.43402282]
