In [16]:
import pandas as pd
import re
from sklearn.model_selection import cross_val_score
import operator
from nltk import ngrams
import warnings
warnings.filterwarnings('ignore')

In [17]:
narrative = {'Money': 'money now broke week until time last \
day when today tonight paid next first night after tomorrow \
month while account before long Friday rent buy bank still \
bills bills ago cash due due soon past never paycheck check \
spent years poor till yesterday morning dollars financial \
hour bill evening credit budget loan bucks deposit dollar \
current payed'.split(),'Job':'work job paycheck unemployment\
interview fired employment hired hire'.split(),'Student':'college\
student school roommate studying university finals semester class\
study project dorm tuition'.split(),'Family':'family mom wife parents\
mother hus- band dad son daughter father parent mum'.split(),'Craving':'friend \
girlfriend craving birthday boyfriend celebrate party game games movie\
date drunk beer celebrating invited drinks crave wasted invite'.split()}

polite_words = [
    "please","thanks","thank you","think", "thought", "thinking", "almost",
    "apparent", "apparently", "appear", "appeared", "appears", "approximately", "around",
    "assume", "assumed", "certain amount", "certain extent", "certain level", "claim",
    "claimed", "doubt", "doubtful", "essentially", "estimate",
    "estimated", "feel", "felt", "frequently", "from our perspective", "generally", "guess",
    "in general", "in most cases", "in most instances", "in our view", "indicate", "indicated",
    "largely", "likely", "mainly", "may", "maybe", "might", "mostly", "often", "on the whole",
    "ought", "perhaps", "plausible", "plausibly", "possible", "possibly", "postulate",
    "postulated", "presumable", "probable", "probably", "relatively", "roughly", "seems",
    "should", "sometimes", "somewhat", "suggest", "suggested", "suppose", "suspect", "tend to",
    "tends to", "typical", "typically", "uncertain", "uncertainly", "unclear", "unclearly",
    "unlikely", "usually", "broadly", "tended to", "presumably", "suggests",
    "from this perspective", "from my perspective", "in my view", "in this view", "in our opinion",
    "in my opinion", "to my knowledge", "fairly", "quite", "rather", "argue", "argues", "argued",
    "claims", "feels", "indicates", "supposed", "supposes", "suspects", "postulates"
]

req_features = [
 'karma',
 'nt_Craving',
 'nt_Family',
 'nt_Job',
 'nt_Money',
 'nt_Student',
 'politeness',
 'popularity',
 'length',
 'scores',
 'evidentiality',
 'pizza_received',
 'activity',
 'age'
]

In [18]:
def get_length(df):
    return [len(x.split()) for x in df['request_text_edit_aware']]

def get_age(df):
    return df['requester_days_since_first_post_on_raop_at_request']

def get_popularity(df):
        return df['number_of_upvotes_of_request_at_retrieval']

def get_evidentiality(df):
    urls = []
    for text in df['request_text_edit_aware']:
        url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        urls.append(len(url))
    return urls

def get_activity(df):
    return df['requester_number_of_subreddits_at_request']
    
def get_narrative(df,narrative):
        col='request_text_edit_aware'
        request_narrative,narration = [],[]
        for request in df[col]:
            word_count = {'Money':0,'Job':0,'Student':0,'Family':0,'Craving':0}
            n = 0
            for word in request.split():
                for lexicon in narrative:
                    if word in narrative[lexicon]:
                        word_count[lexicon] += 1
            narration.append(max(word_count.items(), key=operator.itemgetter(1))[0])
        return narration
    
def get_politeness(df,polite_words):
        count,politeness = 0,[]
        col='request_text_edit_aware'
        for request in df[col]:
            count += 1
            request_ngrams = []
            for grams in ngrams(request.split(),3):
                request_ngrams.append(' '.join(grams))
            for grams in ngrams(request.split(),2):
                request_ngrams.append(' '.join(grams))
            request_words = set(request.split())
            request_ngrams = set(request_ngrams)
            #print(request_words,request_ngrams)
            num = len(request_words.intersection(set(polite_words))) +  len(request_ngrams.intersection(set(polite_words)))
            try:
                politeness.append(float(num)/len(request_words))
            except:
                politeness.append(0.0)
        return politeness
    
def train_model(data,model,response_col,scoring='roc_auc',cv=5):
        data_copy = data.copy()
        y = data_copy.pop(response_col)
        scores = cross_val_score(model,data_copy,y,scoring=scoring,cv=cv)
        del data_copy
        return scores.mean()

In [19]:
df = pd.read_json('train.json')

In [20]:
df['length'] = get_length(df)
df['age']= get_age(df)
df['evidentiality'] = get_evidentiality(df)
df['activity'] = get_activity(df)
narrative_topics = get_narrative(df,narrative=narrative)
df['nt'] = narrative_topics
df = pd.get_dummies(df,columns=['nt'])
df['popularity']= get_popularity(df)
df['politeness']= get_politeness(df,polite_words)
df['pizza_received']= df['requester_received_pizza']
#print(df['pizza_received'])

In [21]:
df_new = df.drop([col for col in df.columns if col not in req_features],axis=1)
df_new.head(20)

Unnamed: 0,length,age,evidentiality,activity,nt_Craving,nt_Family,nt_Job,nt_Money,nt_Student,popularity,politeness,pizza_received
0,67,0.0,0,0,0,1,0,0,0,1,0.0,False
1,16,0.0,0,12,0,0,0,1,0,5,0.0,False
2,59,0.0,0,0,0,0,0,1,0,3,0.0,False
3,30,0.0,0,4,0,1,0,0,0,1,0.0,False
4,103,101.606505,0,11,0,0,0,1,0,6,0.025,False
5,34,340.819329,0,21,0,0,0,1,0,4,0.066667,True
6,207,0.0,1,0,0,0,0,1,0,2,0.0,False
7,87,0.0,0,15,0,0,0,1,0,6,0.029851,False
8,47,0.0,0,20,0,0,0,1,0,1,0.02381,False
9,59,0.0,0,1,0,0,0,1,0,6,0.018519,True


In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
auc_score = train_model(df_new,model=model,response_col='pizza_received',cv=3)
acc_score = train_model(df_new,model=model,response_col='pizza_received',cv=3,scoring='accuracy')

In [24]:
print(auc_score*100,"%")
print(acc_score*100,"%")

65.12273582939083 %
75.34644908973065 %
