In [1]:
import pandas as pd
import numpy as np
import math
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Columns that exist in train but not in test.
set(train.columns).difference(test.columns)

{'backers_count', 'final_status'}

In [4]:
# Checking imbalance: count how many training rows fall in each final_status class
train['final_status'].value_counts()

0    73568
1    34561
Name: final_status, dtype: int64

In [5]:
# Pre-processing: set the ids and the label aside, then remove the two
# train-only columns so train and test share the same columns.
train_id = train['project_id']
test_id = test['project_id']
target = train['final_status']
train = train.drop(columns=['final_status', 'backers_count'])

In [6]:
# Imbalance ratio: number of 0-labelled rows per 1-labelled row
weight = float((target == 0).sum()) / float((target == 1).sum())

In [7]:
# Concatenating train and test into one frame (train rows first, fresh 0..n-1 index)
data = pd.concat(objs=[train, test], ignore_index=True)

# Text columns, kept together with the project id
text = data.loc[:, ['project_id', 'name', 'desc', 'keywords']]

In [8]:
# Columns carried into feature engineering, in the order they should appear.
cols = [
    'project_id',
    'name',
    'desc',
    'keywords',
    'goal',
    'disable_communication',
    'country',
    'currency',
    'deadline',
    'state_changed_at',
    'created_at',
    'launched_at',
]

In [9]:
# Keep only the columns listed in `cols`, in that order
data= data[cols]

In [10]:
# Show dtypes and non-null counts of the combined frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171594 entries, 0 to 171593
Data columns (total 12 columns):
project_id               171594 non-null object
name                     171593 non-null object
desc                     171582 non-null object
keywords                 171594 non-null object
goal                     171594 non-null float64
disable_communication    171594 non-null bool
country                  171594 non-null object
currency                 171594 non-null object
deadline                 171594 non-null int64
state_changed_at         171594 non-null int64
created_at               171594 non-null int64
launched_at              171594 non-null int64
dtypes: bool(1), float64(1), int64(4), object(6)
memory usage: 14.6+ MB


In [None]:
import readability

def computeRead(text):
    """Return the Flesch-Kincaid grade level of ``text``, converted to an int."""
    grade = readability.Readability(text).FleschKincaidGradeLevel()
    return int(grade)
def ARIscore(text):
    """Return the ARI score that ``readability`` reports for ``text``, as a float."""
    ari = readability.Readability(text).ARI()
    return float(ari)
def LIXscore(text):
    """Return the LIX score that ``readability`` reports for ``text``, as a float."""
    lix = readability.Readability(text).LIX()
    return float(lix)

In [None]:
# Readability features: the same three scorers run over the description
# (plain column names) and over the project name (names suffixed with 'X').
# Values are passed through str() first, so missing text is scored as 'nan'.
for source_col, suffix in (('desc', ''), ('name', 'X')):
    for feature, scorer in (
        ('readscore', computeRead),
        ('ariscore', ARIscore),
        ('lixscore', LIXscore),
    ):
        data[feature + suffix] = data[source_col].apply(lambda d: scorer(str(d)))

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single analyzer instance shared by the sentiment helper functions.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be installed — confirm.
analyzer = SentimentIntensityAnalyzer()

def compoundScore(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return float(analyzer.polarity_scores(text)['compound'])
def negSent(text):
    """Return the 'neg' entry of the analyzer's polarity scores for ``text``."""
    return float(analyzer.polarity_scores(text)['neg'])
def posSent(text):
    """Return the 'pos' entry of the analyzer's polarity scores for ``text``."""
    return float(analyzer.polarity_scores(text)['pos'])
def neuSent(text):
    """Return the 'neu' entry of the analyzer's polarity scores for ``text``."""
    return float(analyzer.polarity_scores(text)['neu'])

In [None]:
# Sentiment features for the description (plain column names) and for the
# project name (names suffixed with 'X'). Values are passed through str()
# first, so missing text is scored as the string 'nan'.
for source_col, suffix in (('desc', ''), ('name', 'X')):
    # Score every text exactly once and reuse the resulting dict for all four
    # features; scoring separately per feature repeats the same work four times.
    polarity = data[source_col].apply(lambda d: analyzer.polarity_scores(str(d)))
    for feature, key in (
        ('compoundScore', 'compound'),
        ('negSent', 'neg'),
        ('posSent', 'pos'),
        ('neuSent', 'neu'),
    ):
        data[feature + suffix] = polarity.apply(lambda res: float(res[key]))

In [None]:
# The id and raw text columns are not used as model features; remove them.
data = data.drop(columns=['project_id', 'name', 'desc', 'keywords'])

In [None]:
# Extracting attributes from dates
import datetime
timestamp_cols = ('deadline', 'state_changed_at', 'created_at', 'launched_at')

# The four columns hold epoch values in seconds; turn them into datetimes.
for stamp in timestamp_cols:
    data[stamp] = pd.to_datetime(data[stamp], unit='s')

# For each timestamp add <column>_month, <column>_day and <column>_year,
# in that order.
for stamp in timestamp_cols:
    for part in ('month', 'day', 'year'):
        data[stamp + '_' + part] = getattr(data[stamp].dt, part)



# Both margins are whole calendar-day differences. The times are reset to
# midnight with normalize() before subtracting, and the day count is read
# numerically via .dt.days rather than parsed out of the timedelta's string
# form, whose layout is not stable (a zero-day Python timedelta prints as
# '0:00:00', which cannot be cast to int).

# deadline - state_changed_at
data['deadline_margin'] = (
    data['deadline'].dt.normalize() - data['state_changed_at'].dt.normalize()
).dt.days.astype('int64')

# launched_at - created_at
data['launch_margin'] = (
    data['launched_at'].dt.normalize() - data['created_at'].dt.normalize()
).dt.days.astype('int64')

In [None]:
# Print the value distribution of every column, framed by separator lines.
for col in data.columns:
    separator = '____________________________________'
    print(f'{col}:', separator, data[col].value_counts(), separator, sep='\n')

In [None]:
def currency_conversion_ratio(x):
    """Return the multiplier that expresses an amount in currency ``x`` in USD.

    USD is the unit (ratio 1.0). The rates are fixed constants hard-coded
    here, not live exchange rates.

    Parameters
    ----------
    x : str
        Currency code, e.g. ``'USD'`` or ``'GBP'``. Matching is exact
        (case-sensitive).

    Returns
    -------
    float
        The ratio for a known code. For an unknown code or a non-string
        value the result is ``float('nan')``, so the missing ratio is
        explicit and the result is always a float (previously the function
        fell off the end and returned ``None``).
    """
    ratios = {
        'USD': 1.000,       # Setting USD as unit
        'GBP': 1.390,
        'EUR': 1.220,
        'CAD': 0.800,
        'AUD': 0.800,
        'SEK': 0.120,
        'NZD': 0.730,
        'DKK': 0.160,
        'NOK': 0.130,
        'CHF': 1.040,
        'MXN': 0.054,
        'SGD': 0.760,
        'HKD': 0.130,
    }
    # Guard non-strings (e.g. NaN from a missing currency, or unhashable values)
    # before the dictionary lookup.
    if not isinstance(x, str):
        return float('nan')
    return ratios.get(x, float('nan'))

In [None]:
# Look up the USD conversion ratio for each row's currency.
data['conversion_ratio'] = data['currency'].apply(currency_conversion_ratio)

In [None]:
# Scale every goal by its conversion ratio, then discard the helper ratio
# column and the four datetime columns.
data['goal'] = data['goal'].mul(data['conversion_ratio'])
data = data.drop(
    columns=['conversion_ratio', 'deadline', 'state_changed_at', 'created_at', 'launched_at']
)

In [None]:
# Preview the first ten rows of the engineered feature frame
data.head(10)

In [None]:
# Show dtypes and non-null counts of the engineered feature frame
data.info()

In [None]:
# Label Encoding
from sklearn import preprocessing 
# Replace every object-typed column with the integer codes of a LabelEncoder
# fitted on that column's own values; other columns are left untouched.
for column in data.columns:
    if data[column].dtype != 'object':
        continue
    raw_values = list(data[column].values)
    encoder = preprocessing.LabelEncoder()
    data[column] = encoder.fit(raw_values).transform(raw_values)


In [None]:
# Splitting back into train and test: the first len(train) rows of `data`
# are the training rows, everything after them is test.
n_train = len(train)
train = data.iloc[:n_train]
test = data.iloc[n_train:]

In [None]:
# Model 4: LightGBM

In [None]:
import lightgbm as lgb
# Hyper-parameters of the LightGBM classifier; scale_pos_weight carries the
# class imbalance ratio stored in `weight`.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq > 0 — confirm whether subsample=0.7 is actually active here.
lgbm_params = dict(
    n_estimators=2900,
    max_depth=3,
    subsample=0.7,
    colsample_bytree=0.7,
    scale_pos_weight=weight,
)
gbm = lgb.LGBMClassifier(**lgbm_params).fit(train, target)

In [None]:
# Column 1 of predict_proba is taken as the score for label 1.
positive_proba = gbm.predict_proba(test)[:, 1]

# Threshold once at 0.5 and emit integer 0/1 labels. The target column is
# integer-typed, whereas overwriting the probability array in place left the
# labels as floats (1.0 / 0.0) and depended on the order of the two masks.
prediction = (positive_proba >= 0.5).astype(int)

In [None]:
# Imported here so the cell runs on a fresh kernel; without it the final
# FileLink(...) call raises NameError.
from IPython.display import FileLink

# Build the submission: one row per test project, id first, predicted label second.
sub = pd.DataFrame({'project_id': test_id, 'final_status': prediction})
sub = sub[['project_id', 'final_status']]
filename = 'prediction.csv'
sub.to_csv(filename, index=False)
# Last expression of the cell: renders a link to the written file.
FileLink(filename)