In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
cd ..

C:\Projects\python\recommender


In [3]:
from pathlib import Path
import json

import numpy as np
import pandas as pd

In [4]:
from utils import build_logger

logger = build_logger()

In [5]:
REGS_PATH = Path("./inputs/topcoder/regs.csv")
CHA_PATH = Path("./inputs/topcoder/challenge_sm.csv")
regs_min = 4
cha_min = 4

In [6]:
regs_df = pd.read_csv(REGS_PATH)
regs_df.head()

Unnamed: 0,challengeId,registant,date
0,30044052,phead,2014-07-09
1,30044052,lovefreya,2014-07-09
2,30044052,TMALBONPH,2014-07-09
3,30044052,daga_sumit,2014-07-09
4,30044052,abcivashritt,2014-07-09


In [7]:
regs_df['date'] = pd.to_datetime(regs_df['date'], infer_datetime_format=True)
regs_df.head()

Unnamed: 0,challengeId,registant,date
0,30044052,phead,2014-07-09
1,30044052,lovefreya,2014-07-09
2,30044052,TMALBONPH,2014-07-09
3,30044052,daga_sumit,2014-07-09
4,30044052,abcivashritt,2014-07-09


In [8]:
regs_df.date.head()

0   2014-07-09
1   2014-07-09
2   2014-07-09
3   2014-07-09
4   2014-07-09
Name: date, dtype: datetime64[ns]

In [9]:
logger.info(f"Read dataset in {REGS_PATH}")
logger.info(f"Original regs shape: {regs_df.shape}")

2019-09-09 16:14:25,604 - C:\Projects\python\recommender\utils.py - INFO - Read dataset in inputs\topcoder\regs.csv
2019-09-09 16:14:25,605 - C:\Projects\python\recommender\utils.py - INFO - Original regs shape: (610025, 3)


In [10]:
regs_counts = regs_df.registant.value_counts()
chag_counts = regs_df.challengeId.value_counts()

In [11]:
logger.info(f"Original registants size: {regs_counts.size}")
logger.info(f"Original challenge size: {chag_counts.size}")

2019-09-09 16:14:26,626 - C:\Projects\python\recommender\utils.py - INFO - Original registants size: 60017
2019-09-09 16:14:26,627 - C:\Projects\python\recommender\utils.py - INFO - Original challenge size: 39916


In [12]:
regs_counts = regs_counts[regs_counts >= regs_min]
chag_counts = chag_counts[chag_counts >= cha_min]

In [13]:
logger.info(f"Filter registant size: {regs_counts.size}")
logger.info(f"Filter challenge size: {chag_counts.size}")

2019-09-09 16:14:27,463 - C:\Projects\python\recommender\utils.py - INFO - Filter registant size: 9894
2019-09-09 16:14:27,464 - C:\Projects\python\recommender\utils.py - INFO - Filter challenge size: 29494


In [14]:
# Remove sparse item
regs_df = regs_df[regs_df.registant.isin(regs_counts.index)]
regs_df = regs_df[regs_df.challengeId.isin(chag_counts.index)]

In [15]:
logger.info(f"Filter dataframe shape: {regs_df.shape}")

2019-09-09 16:14:28,275 - C:\Projects\python\recommender\utils.py - INFO - Filter dataframe shape: (533229, 3)


In [16]:
# sort datafrome by registant
regs_df = regs_df.sort_values(by=['registant', 'date'])
regs_df.head()

Unnamed: 0,challengeId,registant,date
446754,30022003,(acm)zhupeijun,2010-03-17
10253,30045145,(acm)zhupeijun,2014-08-19
16217,30045639,(acm)zhupeijun,2014-09-05
16674,30045678,(acm)zhupeijun,2014-09-08
482515,30000149,-Neo-,2006-11-16


In [17]:
regs_df['previousId'] = regs_df['challengeId']
regs_df.head()

Unnamed: 0,challengeId,registant,date,previousId
446754,30022003,(acm)zhupeijun,2010-03-17,30022003
10253,30045145,(acm)zhupeijun,2014-08-19,30045145
16217,30045639,(acm)zhupeijun,2014-09-05,30045639
16674,30045678,(acm)zhupeijun,2014-09-08,30045678
482515,30000149,-Neo-,2006-11-16,30000149


In [18]:
regs_df['previousId'] = regs_df['previousId'].shift(periods=1).fillna(0).astype('int64')
regs_df.head()

Unnamed: 0,challengeId,registant,date,previousId
446754,30022003,(acm)zhupeijun,2010-03-17,0
10253,30045145,(acm)zhupeijun,2014-08-19,30022003
16217,30045639,(acm)zhupeijun,2014-09-05,30045145
16674,30045678,(acm)zhupeijun,2014-09-08,30045639
482515,30000149,-Neo-,2006-11-16,30045678


In [19]:
# Set first item non for each user
regs_df = regs_df.sort_values(by=['registant', 'date'])
first_mask = ~regs_df.duplicated(subset=['registant'], keep='first')
first_mask.head()

446754     True
10253     False
16217     False
16674     False
482515     True
dtype: bool

In [20]:
regs_df['previousId'][first_mask] = -1
regs_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,challengeId,registant,date,previousId
446754,30022003,(acm)zhupeijun,2010-03-17,-1
10253,30045145,(acm)zhupeijun,2014-08-19,30022003
16217,30045639,(acm)zhupeijun,2014-09-05,30045145
16674,30045678,(acm)zhupeijun,2014-09-08,30045639
482515,30000149,-Neo-,2006-11-16,-1


### Add Encoder for Preprocessing

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
challenge_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
registant_encoder = OneHotEncoder(categories='auto', handle_unknown='error')

In [23]:
challenge_encoder.fit(regs_df[['challengeId']])
registant_encoder.fit(regs_df[['registant']])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

### Split Dataset to Train and Valid Dateset

In [24]:
regs_df = regs_df.sort_values(by=['date'])
regs_df.head()

Unnamed: 0,challengeId,registant,date,previousId
482993,30000030,fastprogrammer,2006-11-04,-1
482998,30000030,fabrizyo,2006-11-04,-1
482997,30000030,chaosbreeze,2006-11-04,-1
482992,30000030,j2ee.solutions,2006-11-04,-1
482991,30000030,sharper,2006-11-04,-1


In [27]:
last_mask = regs_df.duplicated(subset=['registant'], keep='last')
remain_df = regs_df[last_mask]
test_df = regs_df[~last_mask]

In [28]:
test_df.head()

Unnamed: 0,challengeId,registant,date,previousId
482018,30000217,yuhai,2006-11-29,30000198
481976,30000229,kosmo73,2006-11-30,30000142
481988,30000227,artpro,2006-11-30,30000201
609964,30000396,despot,2006-12-07,30000231
609788,30000435,andreicsibi,2006-12-13,30000262


In [29]:
last_mask = remain_df.duplicated(subset=['registant'], keep='last')
train_df = remain_df[last_mask]
valid_df = remain_df[~last_mask]

print(train_df.shape)
print(valid_df.shape)

(513594, 4)
(9806, 4)


In [30]:
valid_df.head()

Unnamed: 0,challengeId,registant,date,previousId
482233,30000198,yuhai,2006-11-23,30000145
482318,30000186,nirmal_mehta,2006-11-23,30000153
482156,30000203,Chuande,2006-11-23,30000145
482091,30000208,artpro,2006-11-23,30000144
482140,30000204,biaochen,2006-11-23,30000143


In [31]:
train_df.head()

Unnamed: 0,challengeId,registant,date,previousId
482993,30000030,fastprogrammer,2006-11-04,-1
482998,30000030,fabrizyo,2006-11-04,-1
482997,30000030,chaosbreeze,2006-11-04,-1
482992,30000030,j2ee.solutions,2006-11-04,-1
482991,30000030,sharper,2006-11-04,-1


In [42]:
np.random.choice(registant_encoder.categories_[0], size=5)

ValueError: 'a' must be 1-dimensional or an integer

## Handle Challenge Dataframe

In [61]:
from ast import literal_eval

In [62]:
chag_df = pd.read_csv("./inputs/topcoder/challenge.csv",
                      infer_datetime_format=True,
                      converters={
                          'technologies': literal_eval,
                          'platforms': literal_eval
                      })
chag_df.head()

Unnamed: 0,challengeId,date,prizes,technologies,platforms
0,30044052,2014-07-09,1800.0,[HTML],[HTML]
1,30044053,2014-07-09,1250.0,"[Salesforce, Apex, Visualforce]","[Force.com, Salesforce.com]"
2,30044054,2014-07-09,400.0,"[Java, MySQL, REST]",[AWS]
3,30044055,2014-07-09,400.0,[iOS],[iOS]
4,30044056,2014-07-09,400.0,[iOS],[iOS]


In [63]:
chag_df['date'] = pd.to_datetime(chag_df['date'], infer_datetime_format=True)
chag_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52880 entries, 0 to 52879
Data columns (total 5 columns):
challengeId     52880 non-null int64
date            52880 non-null datetime64[ns]
prizes          52880 non-null float64
technologies    52880 non-null object
platforms       52880 non-null object
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 2.0+ MB


In [72]:
chag_df = chag_df.set_index('challengeId')
chag_df.head()

Unnamed: 0_level_0,date,prizes,technologies,platforms
challengeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30044052,2014-07-09,1800.0,[HTML],[HTML]
30044053,2014-07-09,1250.0,"[Salesforce, Apex, Visualforce]","[Force.com, Salesforce.com]"
30044054,2014-07-09,400.0,"[Java, MySQL, REST]",[AWS]
30044055,2014-07-09,400.0,[iOS],[iOS]
30044056,2014-07-09,400.0,[iOS],[iOS]


In [76]:
chag_df.loc[30044052]

date            2014-07-09 00:00:00
prizes                         1800
technologies                 [HTML]
platforms                    [HTML]
Name: 30044052, dtype: object

In [75]:
chag_df.head()

Unnamed: 0_level_0,date,prizes,technologies,platforms
challengeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30044052,2014-07-09,1800.0,[HTML],[HTML]
30044053,2014-07-09,1250.0,"[Salesforce, Apex, Visualforce]","[Force.com, Salesforce.com]"
30044054,2014-07-09,400.0,"[Java, MySQL, REST]",[AWS]
30044055,2014-07-09,400.0,[iOS],[iOS]
30044056,2014-07-09,400.0,[iOS],[iOS]


In [51]:
from sklearn.preprocessing import MultiLabelBinarizer

In [52]:
tech_binarizer = MultiLabelBinarizer(sparse_output=True)
plat_binarizer = MultiLabelBinarizer(sparse_output=True)

In [53]:
tech_binarizer.fit(chag_df['technologies'].tolist())
plat_binarizer.fit(chag_df['platforms'].tolist())

MultiLabelBinarizer(classes=None, sparse_output=True)

In [58]:
chag_df['platforms'].head()

0                         [HTML]
1    [Force.com, Salesforce.com]
2                          [AWS]
3                          [iOS]
4                          [iOS]
Name: platforms, dtype: object

In [56]:
r = plat_binarizer.transform(chag_df['platforms'].head())

In [57]:
r.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [59]:
# Convert Technologies
r = tech_binarizer.transform(chag_df['technologies'].head())
r

<5x219 sparse matrix of type '<class 'numpy.int32'>'
	with 9 stored elements in Compressed Sparse Row format>

In [60]:
r.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])