In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb



In [3]:
# Directory holding the competition files. Kept in one place so the notebook can
# be pointed at another machine's copy by editing a single line.
DATA_DIR = '/home/ubuntu/hackerearth_recruitment'

# Raw training and test tables, read from CSV.
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [4]:
# Set the test project ids aside now: `test` is later cut down to the feature
# columns only, and the ids are needed again for the submission file.
ids = test.loc[:, 'project_id']

In [5]:
unix_cols = ['deadline','state_changed_at','launched_at','created_at']

# Render the epoch-second columns as 'YYYY-MM-DD HH:MM:SS' strings.
# The conversion is done in UTC (pd.to_datetime with unit='s') rather than with
# datetime.fromtimestamp, which uses the local timezone of whichever machine
# runs the notebook; local-time conversion makes the strings, and any durations
# later computed across a DST change, machine-dependent.
for frame in (train, test):
    for col in unix_cols:
        epoch_seconds = frame[col].astype('int64')
        frame[col] = pd.to_datetime(epoch_seconds, unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')


In [6]:
cols_to_use = ['name', 'desc']
len_feats = ['name_len', 'desc_len']
count_feats = ['name_count', 'desc_count']

# Character length of each text field. Every value goes through str() first,
# so a non-string entry is measured by the length of its string form.
for text_col, len_col in zip(cols_to_use, len_feats):
    for frame in (train, test):
        frame[len_col] = frame[text_col].map(str).map(len)

In [7]:
# Number of whitespace-separated tokens in the project name and description.
for frame in (train, test):
    for text_col in ('name', 'desc'):
        frame[text_col + '_count'] = frame[text_col].str.split().str.len()

In [8]:
# Keyword features: total character length, and number of '-'-separated parts.
for frame in (train, test):
    keywords = frame['keywords']
    frame['keywords_len'] = keywords.str.len()
    frame['keywords_count'] = keywords.str.split('-').str.len()

In [9]:
unix_cols = ['deadline','state_changed_at','launched_at','created_at']


def _parse_timestamp(text):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a datetime.datetime."""
    return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S')


# Turn the formatted timestamp strings back into datetime values so they can be
# subtracted from one another.
for frame in (train, test):
    for col in unix_cols:
        frame[col] = frame[col].apply(_parse_timestamp)

In [11]:
# Durations in whole seconds for every training row:
#   time_lc = launched_at - created_at
#   time_dl = deadline    - launched_at
# Computed with vectorized datetime arithmetic instead of a per-row Python loop
# with scalar .loc lookups; the loop was slow and only worked when the index
# labels were exactly 0..n-1. Results are kept as plain lists, as before.
time_lc = np.round((train['launched_at'] - train['created_at']).dt.total_seconds()).astype(int).tolist()
time_dl = np.round((train['deadline'] - train['launched_at']).dt.total_seconds()).astype(int).tolist()

In [12]:
# Store the natural log of each duration (in seconds) as a training feature.
for feature_name, seconds in (('time_lc', time_lc), ('time_dl', time_dl)):
    train[feature_name] = np.log(seconds)

In [13]:
# Durations in whole seconds for every test row:
#   time_lc = launched_at - created_at
#   time_dl = deadline    - launched_at
# Computed with vectorized datetime arithmetic instead of a per-row Python loop
# with scalar .loc lookups; the loop was slow and only worked when the index
# labels were exactly 0..n-1. Results are kept as plain lists, as before.
time_lc = np.round((test['launched_at'] - test['created_at']).dt.total_seconds()).astype(int).tolist()
time_dl = np.round((test['deadline'] - test['launched_at']).dt.total_seconds()).astype(int).tolist()

In [14]:
# Store the natural log of each duration (in seconds) as a test feature.
for feature_name, seconds in (('time_lc', time_lc), ('time_dl', time_dl)):
    test[feature_name] = np.log(seconds)

In [15]:
feat = ['disable_communication','country']

# Replace each categorical column with integer codes. The encoder is fitted on
# the train and test values together, so a category present in only one of the
# two frames still receives a code.
for col in feat:
    observed = list(train[col].values)
    observed += list(test[col].values)
    le = LabelEncoder()
    le.fit(observed)
    train[col] = le.transform(list(train[col]))
    test[col] = le.transform(list(test[col]))

In [16]:
# Replace the funding goal with log(1 + goal) in both frames.
for frame in (train, test):
    frame['goal'] = np.log1p(frame['goal'])

In [17]:
# All descriptions in one Series of strings: train rows first, then test rows.
desc_values = train['desc'].tolist()
desc_values.extend(test['desc'].tolist())
kickdesc = pd.Series(desc_values).astype(str)

In [18]:
# Runs of non-word characters, digits, or whitespace. Written as a raw string so
# the backslash classes reach the regex engine untouched (in a plain literal,
# '\W', '\d' and '\s' are invalid string escapes), and compiled once instead of
# on every call.
_DESC_NOISE = re.compile(r'(\W+)|(\d+)|(\s+)')


def desc_clean(word):
    """Normalise one description string.

    Each run of non-word characters, each run of digits, and each run of
    whitespace is replaced by a single space; the result is lower-cased.
    Adjacent runs of different kinds each produce their own space, so the
    output may contain consecutive spaces.

    Parameters
    ----------
    word : str
        The text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    return _DESC_NOISE.sub(' ', word).lower()
# Clean every description with desc_clean.
kickdesc = kickdesc.apply(desc_clean)

In [19]:
stop = set(stopwords.words('english'))
stemmer = SnowballStemmer(language='english')


def _normalise_description(text):
    """Drop stopwords, stem the remaining tokens, keep stems longer than 2 chars."""
    kept = [token for token in text.split() if token not in stop]
    stems = [stemmer.stem(token) for token in kept]
    return ' '.join(stem for stem in stems if len(stem) > 2)


# One space-joined string of stems per description.
kickdesc = [_normalise_description(text) for text in kickdesc]

In [20]:
# Token-count vectorizer for the cleaned descriptions, capped at 650 features
# via max_features.
cv = CountVectorizer(max_features=650)

In [21]:
# Fit the vectorizer on every description (train + test) and expand the
# resulting count matrix to a dense one.
desc_counts = cv.fit_transform(kickdesc)
alldesc = desc_counts.todense()

In [22]:
# Wrap the count matrix in a DataFrame and name its columns
# 'variable_0', 'variable_1', ...
combine = pd.DataFrame(alldesc)
combine.columns = ['variable_' + str(position) for position in combine.columns]

In [23]:
# The first len(train) rows of `combine` belong to train, the rest to test.
# The test part is re-indexed from 0 so it lines up with `test` when the two
# are concatenated column-wise.
n_train_rows = train.shape[0]
train_text = combine.iloc[:n_train_rows]
test_text = combine.iloc[n_train_rows:].reset_index(drop=True)

In [24]:
# Hand-built numeric features that go into the model next to the text counts.
cols_to_use = [
    'name_len', 'desc_len', 'keywords_len',        # character lengths
    'name_count', 'desc_count', 'keywords_count',  # token counts
    'time_lc', 'time_dl',                          # log durations
    'goal',                                        # log1p of the goal
]

In [25]:
# Label column, taken before `train` is reduced to the feature columns.
target = train.loc[:, 'final_status']

In [26]:
# Keep only the engineered feature columns in both frames.
train = train[cols_to_use]
test = test[cols_to_use]

In [27]:
# Final design matrices: engineered features followed by the description
# token counts, joined column-wise.
X_train, X_test = (
    pd.concat(parts, axis=1)
    for parts in ([train, train_text], [test, test_text])
)

In [28]:
# Sanity check: both matrices must have the same number of columns.
# Parenthesized so the cell runs on Python 3 as well; on Python 2 the printed
# text is unchanged.
print(X_train.shape)
print(X_test.shape)

(108129, 659)
(63465, 659)


In [29]:
# XGBoost input containers; only the training matrix carries labels.
dtrain = xgb.DMatrix(X_train, label=target)
dtest = xgb.DMatrix(X_test)

In [32]:
# Booster settings shared by cross-validation and the final fit.
params = dict(
    objective='binary:logistic',
    eval_metric='error',
    eta=0.027,
    max_depth=6,
    subsample=0.72,
    colsample_bytree=0.68,
    min_child_weight=5,
)

In [33]:
# 5-fold cross-validation of `params`: at most 1050 boosting rounds, with
# early_stopping_rounds=40, reporting progress every 10 rounds.
# nfold is a plain int literal: the Python 2 long suffix (5L) is a syntax error
# on Python 3, and 5 means the same thing on Python 2.
bst = xgb.cv(
    params,
    dtrain,
    num_boost_round=1050,
    early_stopping_rounds=40,
    nfold=5,
    verbose_eval=10,
)

[0]	train-error:0.312321+0.0010311	test-error:0.314793+0.0017622
[10]	train-error:0.309149+0.0009354	test-error:0.312564+0.00364424
[20]	train-error:0.308833+0.000833894	test-error:0.312157+0.00278829
[30]	train-error:0.30763+0.000813312	test-error:0.3115+0.00259392
[40]	train-error:0.306354+0.000558816	test-error:0.310372+0.00303008
[50]	train-error:0.304291+0.000783419	test-error:0.308476+0.00228582
[60]	train-error:0.302354+0.000391137	test-error:0.30769+0.00206937
[70]	train-error:0.300788+0.00034748	test-error:0.306802+0.00181365
[80]	train-error:0.299406+0.000297243	test-error:0.305785+0.0016786
[90]	train-error:0.297954+0.000327124	test-error:0.304462+0.00176786
[100]	train-error:0.296765+0.00042665	test-error:0.303547+0.00159163
[110]	train-error:0.295901+0.000309913	test-error:0.303001+0.00162185
[120]	train-error:0.294705+0.000495821	test-error:0.301975+0.00170372
[130]	train-error:0.293838+0.000450899	test-error:0.300994+0.00182817
[140]	train-error:0.292867+0.000448504	test

In [34]:
# Fit the final booster on the whole training matrix with the same `params`
# that were cross-validated.
# NOTE(review): the round count is a fixed 1000 and is not derived from the
# cross-validation result held in `bst` — confirm this is intentional.
model = xgb.train(params, dtrain, num_boost_round=1000)

In [35]:
# Score the test matrix. With the 'binary:logistic' objective set in `params`
# these are presumably probabilities; they are thresholded at 0.5 further down.
predictions = model.predict(dtest)

In [38]:
# Submission table: project id first, then the model score for that project.
sub = pd.DataFrame({'project_id': ids})
sub['final_status'] = predictions

In [39]:
# Convert scores into hard 0/1 labels: strictly above 0.5 becomes 1, else 0.
sub['final_status'] = np.where(sub['final_status'] > 0.5, 1, 0)

In [40]:
# Write the submission file to the current working directory, without the index.
# NOTE(review): the trailing "0.70" is presumably the score this submission
# obtained — confirm.
sub.to_csv("check.csv",index=False) #0.70