# Read Train Data

In [1]:

import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn import preprocessing
import pyspark

# Load the previously pickled vectorizer (used further down as the subject
# vectorizer) and the pickled feature matrix that goes with it.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('vect2.p', 'rb') as vect_file:
    vect = pickle.load(vect_file)
with open('X_train_vectorized2.p', 'rb') as matrix_file:
    X_train_vectorized = pickle.load(matrix_file)

# Read the table, strip the 'eds_model_predicted_rr.' prefix from the column
# names and add the character length of each subject as a 'len' column.
data = (
    pd.read_csv('data_X2.csv')
    .rename(columns=lambda name: name.replace('eds_model_predicted_rr.', ''))
    .assign(len=lambda frame: frame['subject'].str.len())
)

# Number of independent train/test splits (one model is trained per split),
# and the dict that will collect the per-split artefacts.
number_of_splits = 8
d = {}

# Bag-of-words over the 'emotion' column; tokens seen in fewer than 10 rows
# are dropped (min_df=10). The resulting columns are appended to the right of
# the loaded feature matrix.
vect_emotions = CountVectorizer(min_df=10).fit(data['emotion'])
X_emotions = vect_emotions.transform(data['emotion'])
X_train_vectorized = hstack([X_train_vectorized, X_emotions])











In [2]:
# Preview the first 20 rows of `data`; as the cell's last expression it is
# rendered as the cell output.
data.head(20)

Unnamed: 0.1,Unnamed: 0,subject,emotion,emotionalphrases,ispromo,read_rate,predict_industry_mapping,sender_domain,relative_uplift,relative_uplift_ind,len
0,108,"Life Coverage Rates, Save up to 70%",[],{},1,0.0,telecommunications,comcastbusiness.net,-1.0,-1.0,35
1,135,An imminent event is sending this stock price ...,[],{},1,6.25,telecommunications,comcastbusiness.net,1.106126,-0.423835,63
2,136,This stock tip is for your eyes only. The chan...,['exclusivity'],"{""tip is for your eyes only"": ""exclusivity""}",1,0.0,telecommunications,comcastbusiness.net,-1.0,-1.0,69
3,139,You can make 10x on your money by next week if...,['encouragement'],"{""You can"": ""encouragement""}",1,0.0,telecommunications,comcastbusiness.net,-1.0,-1.0,70
4,146,This company just found a huge cure and no one...,['fascination'],"{""This company just found"": ""fascination""}",1,1.56,telecommunications,comcastbusiness.net,-0.474311,-0.856189,66
5,147,Trade in the old & Save on new Windows with Ev...,[],{},1,0.0,telecommunications,comcastbusiness.net,-1.0,-1.0,52
6,164,In less than 5 days this company could yield y...,[],{},1,3.7,telecommunications,comcastbusiness.net,0.246826,-0.65891,61
7,186,>>>> INFORMATION: JOBS- AVAILABLE <<<<,['anxiety'],"{""INFORMATION"": ""anxiety""}",1,0.0,telecommunications,chello.nl,-1.0,-1.0,38
8,224,>>>> Jobs - Opportunities <<<,[],{},1,13.64,telecommunications,chello.nl,-0.005689,0.257423,29
9,285,Big Savings On Christian & Messianic Products ...,['gratification'],"{""Big Savings"": ""gratification""}",1,9.3,telecommunications,cfl.rr.com,0.732381,-0.142666,58


# Select dependent variable column, train models, and save test data ready for predictions

In [3]:
# Name of the `data` column used as the regression target (y) when fitting
# the models and as the ground truth stored for the test rows.
column_y = 'relative_uplift'



def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to the right of a sparse matrix.

    `feature_to_add` is converted to a sparse matrix and transposed, so a
    flat sequence becomes a single new column and a list of sequences
    becomes one new column per sequence. The result is returned in CSR
    format.
    """
    new_columns = csr_matrix(feature_to_add).transpose()
    return hstack([X, new_columns], format='csr')

# Positional row indices 0..len(data)-1; these are what gets split into
# train and test sets.
index_of_data = list(range(len(data)))

def dummies_and_method(series):
    """
    One-hot encode `series` and fit a LabelEncoder on its distinct values.

    The distinct values are placed in front of the full value list before
    calling `pd.get_dummies`, and those leading rows are sliced off again
    afterwards, so the returned frame has one row per element of `series`
    (its index starts at the number of distinct values, not at 0).

    Returns a tuple `(dummies, encoder)`.
    """
    categories = list(series.unique())
    encoder = preprocessing.LabelEncoder().fit(categories)
    stacked = pd.get_dummies(categories + list(series))
    # Drop the helper rows that were prepended above.
    return stacked.iloc[len(categories):], encoder






# Train one LinearRegression per random train/test split and collect, for each
# split, the fitted artefacts together with the held-out rows that are needed
# later to score the model.

# CSR supports the row indexing used below. The conversion does not depend on
# the split, so it is done once instead of on every iteration.
X_train_vectorized = csr_matrix(X_train_vectorized)

for i in range(number_of_splits):

    model = LinearRegression()

    # Seed every split with its own index: the splits still differ from one
    # another, but re-running the cell reproduces the same partitions.
    i_train, i_test = train_test_split(index_of_data, random_state=i)

    data_X = data.iloc[i_train]  # training rows
    data_y = data.iloc[i_test]   # held-out rows (test set, not the target column)

    # Feature layout: loaded feature columns + emotion tokens, then subject
    # length, then one column per industry dummy.
    X_train = X_train_vectorized[i_train, :]
    X_train = add_feature(X_train, data_X['len'])

    indust_dummies, indust_le = dummies_and_method(data_X['predict_industry_mapping'])

    for ind_dum in list(indust_dummies.columns):
        X_train = add_feature(X_train, indust_dummies[ind_dum])

    model.fit(X_train, data_X[column_y])

    d[i] = {
        'y_test': data_y[column_y],
        'subject_test': data_y['subject'],
        'model': model,
        'indust_le': indust_le,
        'vect_subject': vect,
        'vect_emotions': vect_emotions,
        'indust_test': data_y['predict_industry_mapping']
    }

# Use a context manager so the file is flushed and closed deterministically.
with open("d_test.p", "wb") as d_test_file:
    pickle.dump(d, d_test_file)


In [4]:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import json
from scipy.sparse import csr_matrix, hstack





# Endpoints that ORPredictor.request_api posts the subject text to:
# one for emotion extraction, one for the promotional / non-promotional label.
url_emotions = "http://rserve.ath.persado.com:8000/emotionextract"
url_isPromo = 'https://ds-demo.persado.com/ispromo/endpoint'
# JSON in, JSON out.
HEADER = {"Content-Type": "application/json", "Accept": "application/json"}
# NOTE(review): passed as `verify=` to requests.post, so False turns off TLS
# certificate verification for these calls -- confirm this is intentional.
VERIFY = False


class ORPredictor(object):
    """Score a single subject line with the artefacts of one trained split.

    `info` is one entry of the dict written by the training cell and must
    provide the keys 'vect_emotions', 'vect_subject', 'indust_le' and 'model'.
    """

    def __init__(self, info):
        self.vect_emotions = info['vect_emotions']
        self.vect_subject = info['vect_subject']
        self.indust_le = info['indust_le']
        self.model = info['model']

    def predict(self, subject, industry):
        """Return `(prediction, isPromo)` for one subject line.

        The feature row is assembled in the same column order as the training
        cell uses: subject tokens, emotion tokens, subject length, industry
        one-hot. `isPromo` is whatever `request_api` reports (1, 0 or None).

        Raises whatever `indust_le.transform` raises when `industry` was not
        seen when the encoder was fitted.
        """
        res = self.request_api(subject)
        emotion = res[1]
        isPromo = res[3]
        # Wrap the plain Python values in explicit 1-row sparse matrices
        # instead of relying on hstack to coerce bare lists.
        parts = [
            self.vect_subject.transform([subject]),
            self.vect_emotions.transform([emotion]),
            csr_matrix([[len(subject)]]),
            csr_matrix([self.dummies_transform(industry)]),
        ]
        x = hstack(parts)
        return self.model.predict(x)[0], isPromo

    @staticmethod
    def request_api(text):
        """Query the emotion and isPromo services for `text`.

        Returns the tuple `(text, emotion, emotionalPhrases, isPromo)`.
        Each service call is best-effort: on failure the emotion fields fall
        back to `'no_emotion'` / `None` and `isPromo` falls back to `None`.

        NOTE(review): the requests carry no timeout, so an unresponsive
        service blocks the call indefinitely.
        """
        result = {}
        payload = {'text': text}
        try:
            res_emotions = requests.post(url_emotions, headers=HEADER, verify=VERIFY, data=json.dumps(payload)).json()
            result.update({
                'emotion': str([j for j in res_emotions['EMO_VALUES'].values()]),
                'emotionalPhrases': json.dumps(res_emotions['EMO_VALUES'])
            })
            # NOTE(review): str() of a list is never empty (it is at least
            # '[]'), so this branch cannot trigger. Left untouched because the
            # training data preview also shows '[]' for rows without emotions.
            if not result['emotion']:
                result['emotion'] = 'no_emotion'
        except Exception:
            # `Exception` instead of a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate and a long scoring loop can be stopped.
            result.update({'emotion': 'no_emotion', 'emotionalPhrases': None, })
        try:
            res_isPromo = requests.post(url_isPromo, headers=HEADER, verify=VERIFY, data=json.dumps(payload)).json()
            result.update({'isPromo': 1 if res_isPromo['label'] == 'Promotional' else 0})
        except Exception:
            result.update({'isPromo': None})
        return (text, result['emotion'], result['emotionalPhrases'], result['isPromo'])

    def dummies_transform(self, rec):
        """Return a one-hot list for `rec` with one slot per encoder class."""
        one_hot = [0] * len(self.indust_le.classes_)
        one_hot[self.indust_le.transform([rec])[0]] = 1
        return one_hot

# Make predictions for various data sets

In [None]:

from sklearn.metrics import mean_squared_error
import pickle
from math import sqrt


# Load the per-split artefacts written by the training cell.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('d_test.p', 'rb') as f:
    d_test = pickle.load(f)


import datetime


# split index -> {'a': actual values, 'p': predictions, 'promos': isPromo flags}
dd={}


for i,v in d_test.items():
    actual=[]
    predicted=[]
    promos=[]
    predictor = ORPredictor(info=v)
    print(60*'-')
    print('Len of data :'+str(len(v['subject_test'])))
    for actual_cr,sub,ind in zip(v['y_test'],v['subject_test'],v['indust_test']):
        # Time each prediction; every call goes through predictor.predict.
        t0=datetime.datetime.now()

        prediction,isPromo = predictor.predict(sub,ind)
        print(datetime.datetime.now()-t0)
        actual.append(actual_cr)
        predicted.append(prediction)
        promos.append(isPromo)
    dd[i]={'a':actual,'p':predicted,'promos':promos}
    # Root-mean-squared error of this split.
    print(sqrt(mean_squared_error(actual, predicted)))

# Use a context manager so the file is flushed and closed deterministically.
# NOTE(review): the file name is spelled "resutls.p"; it is kept unchanged
# because other code may read that exact path.
with open("resutls.p", "wb") as results_file:
    pickle.dump(dd, results_file)


#     rms = 
#     print(rms)
#
#
# sns.regplot(x=actual, y=predicted, fit_reg=False)


------------------------------------------------------------
Len of data :242
0:00:01.089798
0:00:01.125488
0:00:01.271924
0:00:02.005668
0:00:01.125311
0:00:01.225706
0:00:01.536149
0:00:01.127347
0:00:00.818143
0:00:01.536315
0:00:00.920971
0:00:02.019340
0:00:01.360338
0:00:01.972233
0:00:01.001476
