In [1]:
import pandas as pd
import numpy as np
import re
import time

# Show up to 131 columns when a DataFrame is displayed.
# The fully qualified key is used on purpose: pandas resolves option names by
# pattern matching, and the short form 'max.columns' matches more than one
# option on recent pandas versions (e.g. 'styler.render.max_columns'), which
# makes set_option raise OptionError.
pd.set_option('display.max_columns', 131)

In [2]:
# Load the raw dataset and keep only the rows that already carry a label in `y`.
df = pd.read_csv('raw_data_with_labels.csv')
has_label = df['y'].notnull()
df = df.loc[has_label]
df.shape

(91, 4)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
# One-column frame holding the titles, aligned with the labeled data's index.
# NOTE(review): `df_limpo` is reassigned to `df.copy()` in the next cell, so
# this frame is superseded before it is ever read.
df_limpo = pd.DataFrame({'title': df['title']}, index=df.index)

In [5]:
# Normalise dtypes on the labeled frame: view counts become integers and
# upload dates are parsed from 'YYYY-MM-DD' strings into datetimes.
df['view_count'] = df['view_count'].astype(int)
df['upload_date'] = pd.to_datetime(df['upload_date'], format='%Y-%m-%d')

# Work on an independent copy from here on, and show its schema.
df_limpo = df.copy(deep=True)
df_limpo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 0 to 231
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        91 non-null     object        
 1   upload_date  91 non-null     datetime64[ns]
 2   view_count   91 non-null     int64         
 3   y            91 non-null     float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 3.6+ KB


In [6]:
# Target vector: an independent copy of the label column.
y = df['y'].copy()
# Empty feature frame that shares the cleaned data's index, so columns
# assigned to it later align row by row.
features = pd.DataFrame(index=df_limpo.index)

In [7]:
# Age of each video in days relative to a fixed reference date
# (presumably the day the data was collected — confirm).
snapshot_date = pd.to_datetime('2021-01-31')
features['tempo_desde_pub'] = (snapshot_date - df_limpo['upload_date']) / np.timedelta64(1, 'D')
# Guard against division by zero: a video uploaded on (or after) the reference
# date has an age <= 0, which would turn views_por_dia into inf or a negative
# rate. Treat such videos as one day old. Ages are whole days here, so every
# row with an age >= 1 is left unchanged.
features['tempo_desde_pub'] = features['tempo_desde_pub'].clip(lower=1)

features['views'] = df_limpo['view_count']
# Average number of views per day since publication.
features['views_por_dia'] = features['views'] / features['tempo_desde_pub']
# The age itself is only a helper for the rate; it is not kept as a feature.
features = features.drop(['tempo_desde_pub'], axis=1)

In [8]:
# Positional hold-out split: the first 45 rows train the model, the rest validate it.
n_train = 45
Xtrain, ytrain = features.iloc[:n_train], y.iloc[:n_train]
Xval, yval = features.iloc[n_train:], y.iloc[n_train:]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((45, 2), (46, 2), (45,), (46,))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the titles at the same row position (45) used for the numeric features.
titles = df_limpo['title']
title_train = titles.iloc[:45]
title_val = titles.iloc[45:]

# TF-IDF over title terms; min_df=2 drops terms seen in fewer than two training titles.
# The vocabulary is learned from the training titles only and then reused
# unchanged for the validation titles.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [10]:
# Shape of the training TF-IDF matrix: (training titles, terms kept by min_df=2).
title_bow_train.shape

(45, 44)

In [11]:
from scipy.sparse import hstack

# Append the TF-IDF title columns to the right of the numeric features,
# producing one sparse design matrix per split.
Xtrain_wtitle, Xval_wtitle = (
    hstack([Xtrain, title_bow_train]),
    hstack([Xval, title_bow_val]),
)

In [12]:
# Random forest with balanced class weights; the fixed random_state keeps the fit repeatable.
rf_params = dict(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=3,
)
mdl = RandomForestClassifier(**rf_params)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=3,
                       random_state=0)

In [13]:
# Score the validation rows, keeping column 1 of predict_proba.
# Presumably that column is the positive class (y == 1) — confirm via mdl.classes_.
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [14]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [15]:
# Average precision of the validation scores `p` against the true labels `yval`.
average_precision_score(yval, p)

0.41925805072356803

In [16]:
# ROC AUC of the same validation scores.
# NOTE(review): the recorded result (~0.45) is below 0.5, i.e. no better than a
# random ranking on this small validation split — treat the scores with caution.
roc_auc_score(yval, p)

0.45039682539682546

# Active Learning

In [17]:
# Re-read the same file, this time keeping the rows that have no label yet.
df_unlabeled = pd.read_csv('raw_data_with_labels.csv')
missing_label = df_unlabeled['y'].isnull()
# dropna(how='all') then discards rows in which every column is empty.
df_unlabeled = df_unlabeled[missing_label].dropna(how='all')
df_unlabeled.shape

(209, 4)

In [18]:
# Keep only rows that have both an upload date and a view count; both are
# needed to build the numeric features.
has_date = df_unlabeled['upload_date'].notnull()
has_views = df_unlabeled['view_count'].notnull()
df_unlabeled = df_unlabeled[has_date & has_views]

In [19]:
# Peek at the first remaining unlabeled row to sanity-check the columns.
df_unlabeled.head(1)

Unnamed: 0,title,upload_date,view_count,y
29,Hacker News | New Machine Learning Model predi...,2021-01-29,3.0,


In [20]:
# One-column frame holding the unlabeled titles, aligned with df_unlabeled's index.
# NOTE(review): `df_limpo_u` is reassigned to `df_unlabeled.copy()` in the next
# cell, so this frame is superseded before it is ever read.
df_limpo_u = pd.DataFrame({'title': df_unlabeled['title']}, index=df_unlabeled.index)

In [21]:
# Same dtype normalisation as for the labeled data: integer view counts and
# upload dates parsed from 'YYYY-MM-DD' strings.
df_unlabeled['view_count'] = df_unlabeled['view_count'].astype(int)
df_unlabeled['upload_date'] = pd.to_datetime(df_unlabeled['upload_date'], format='%Y-%m-%d')

# Independent working copy, followed by a schema summary.
df_limpo_u = df_unlabeled.copy(deep=True)
df_limpo_u.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 29 to 299
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        208 non-null    object        
 1   upload_date  208 non-null    datetime64[ns]
 2   view_count   208 non-null    int64         
 3   y            0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 8.1+ KB


In [27]:
# Empty feature frame sharing the cleaned unlabeled data's index, so the
# columns assigned to it afterwards align row by row.
features_u = pd.DataFrame(index=df_limpo_u.index)

In [28]:
# Age of each unlabeled video in days relative to a fixed reference date
# (presumably the day the data was collected — confirm).
snapshot_date = pd.to_datetime('2021-01-31')
features_u['tempo_desde_pub'] = (snapshot_date - df_limpo_u['upload_date']) / np.timedelta64(1, 'D')
# Guard against division by zero: a video uploaded on (or after) the reference
# date has an age <= 0, which would turn views_por_dia into inf or a negative
# rate and make the model reject the input. Treat such videos as one day old.
# Ages are whole days here, so every row with an age >= 1 is left unchanged.
features_u['tempo_desde_pub'] = features_u['tempo_desde_pub'].clip(lower=1)

features_u['views'] = df_limpo_u['view_count']
# Average number of views per day since publication.
features_u['views_por_dia'] = features_u['views'] / features_u['tempo_desde_pub']
# The age itself is only a helper for the rate; it is not kept as a feature.
features_u = features_u.drop(['tempo_desde_pub'], axis=1)

In [29]:
# Titles of the unlabeled videos.
title_u = df_limpo_u['title']
# Only transform (no fit): reuse the vectorizer fitted on the training titles so
# the columns line up with the matrix the model was trained on.
title_bow_u = title_vec.transform(title_u)

In [30]:
# Display the sparse TF-IDF matrix for the unlabeled titles (its repr shows the
# shape and the number of stored elements).
title_bow_u

<208x44 sparse matrix of type '<class 'numpy.float64'>'
	with 652 stored elements in Compressed Sparse Row format>

In [31]:
# Column-stack the numeric features with the TF-IDF title columns, in the same
# order (numeric first, then title terms) used for the training matrix.
Xu_wtitle = hstack([features_u, title_bow_u])
# Display the stacked matrix to check its shape.
Xu_wtitle

<208x46 sparse matrix of type '<class 'numpy.float64'>'
	with 1056 stored elements in COOrdinate format>

In [33]:
# Score every unlabeled row with the fitted model and keep column 1 of the
# probability matrix, matching what was done for the validation scores.
proba_u = mdl.predict_proba(Xu_wtitle)
pu = proba_u[:, 1]
# Store the score next to the raw row so the rows can be filtered by it.
df_unlabeled['p'] = pu
df_unlabeled.head(1)

Unnamed: 0,title,upload_date,view_count,y,p
29,Hacker News | New Machine Learning Model predi...,2021-01-29,3,,0.047


In [36]:
# Rows the model is least sure about: predicted probability inside [0.4, 0.6],
# both bounds included.
p_low, p_high = .4, .60
scores_u = df_unlabeled['p']
mask_u = (scores_u >= p_low) & (scores_u <= p_high)
mask_u.sum()

71

In [37]:
# Show the unlabeled rows selected by mask_u (the uncertain predictions).
df_unlabeled[mask_u]

Unnamed: 0,title,upload_date,view_count,y,p
45,Deep Learning at Scale with Horovod feat. Trav...,2021-01-28,149,,0.404861
49,"Sustainability, Machine Learning, AR/VR and 5G...",2021-01-28,7,,0.481622
53,How to Deploy Machine Learning Models using Po...,2021-01-28,339,,0.404000
59,Flutter iOS & Android Object Detection with Te...,2021-01-28,1643,,0.409452
66,Machine Learning for the Environment - Mining ...,2021-01-27,53,,0.582913
...,...,...,...,...,...
295,Interview with Kaggle Grandmaster Nima Shahbaz...,2020-10-13,1636,,0.454372
296,Interview with Competitions Grandmaster 詹 金 se...,2020-10-13,920,,0.479247
297,Más Kaggle Notebooks y Pytorch Lightning,2020-11-04,132,,0.411061
298,Interview with Competitions Grandmaster Julian...,2020-10-13,1226,,0.454372


In [38]:
# Build the next batch to label by hand: every uncertain row plus 31 rows drawn
# at random from the remaining (confidently scored) rows.
hardest = df_unlabeled[mask_u]
# random_state pins the draw so that re-running the notebook exports the same
# batch; without it the file to be labeled changes on every run.
# NOTE(review): the name `random` is also the name of the stdlib module; it is
# kept here so any later cell that refers to it keeps working.
random = df_unlabeled[~mask_u].sample(31, random_state=0)
# The index is written too, so labeled rows can be matched back to the raw data.
pd.concat([hardest, random]).to_csv('active_label1.csv')