In [None]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn below
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Automated feature engineering
import featuretools as ft

# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

from pathlib import Path

from IPython.display import display

# Seed NumPy's global random number generator with a fixed value.
# (42: "The Answer to the Ultimate Question of Life, the Universe, and Everything".)
np.random.seed(42)

In [None]:
# Location of the processed draw results, relative to the notebook
DATASET = Path("../input/data_processed_2.csv")

# Load the CSV: the first row is the header, DrawDate is parsed as a datetime
# and PrizeType is kept as a string
dataset = pd.read_csv(
    DATASET,
    header=0,
    sep=',',
    quotechar='"',
    parse_dates=['DrawDate'],
    dtype={'PrizeType': str},
)

# Work on a copy so the frame as loaded stays untouched
df = dataset.copy()

In [None]:
# Show the column labels of the loaded data
df.columns

In [None]:
# Columns kept for the analysis
columns = ['DrawNo', 'DrawDate', 'PrizeType', '1st_digit', '2nd_digit', '3rd_digit', '4th_digit', 'LuckyNo']

# Select from the untouched `dataset` (not from `df`) so this cell can be re-run
# even after `df`'s columns have been renamed further down, and take an explicit
# copy so the later in-place edits (rename, adding number_id) act on an
# independent frame rather than on a slice of another frame.
df = dataset[columns].copy()
print(df.shape)
df.head(10)

In [None]:
# Switch the CamelCase source headers to snake_case; the digit columns keep their names
df.rename(
    columns={
        "DrawNo": "draw_no",
        "DrawDate": "draw_date",
        "PrizeType": "prize_type",
        "LuckyNo": "lucky_no",
    },
    inplace=True,
)
df.info()

In [None]:
# Give every distinct lucky_no an integer id: the group number from groupby
df['number_id'] = df.groupby('lucky_no').ngroup()

# Spot check: all rows that share one id
df[df['number_id'] == 1346]

In [None]:
# Put the identifier first, then the draw metadata, the digits and the number itself
df = df.loc[:, [
    "number_id",
    "draw_no",
    "draw_date",
    "prize_type",
    '1st_digit',
    '2nd_digit',
    '3rd_digit',
    '4th_digit',
    "lucky_no",
]]
df.head(3)

## Making Labels

In [None]:
# First calendar year for which labels are generated; also used by the
# year-based filters in the cells below
START_YEAR = 2014

def make_cutoffs(draw_date, results=None):
    """Build the label table for a single draw date.

    Every number that has appeared on or before ``draw_date`` gets one row;
    ``label`` is 1 when the number was drawn on ``draw_date`` itself and 0
    otherwise.

    Parameters
    ----------
    draw_date : datetime-like
        Draw date to label; anything accepted by ``pd.to_datetime``.
    results : pandas.DataFrame, optional
        Frame with ``draw_date``, ``number_id`` and ``lucky_no`` columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``number_id``, ``cutoff_time``, ``lucky_no``, ``label``, one
        row per number, in order of first appearance in ``results``.
    """
    if results is None:
        results = df

    # Normalise once so the comparisons and the cutoff column use the same value
    draw_date = pd.to_datetime(draw_date)

    # One row per number seen so far, carrying its real lucky_no (previously the
    # lucky_no column was filled with number_id values).
    seen = (
        results.loc[results['draw_date'] <= draw_date, ['number_id', 'lucky_no']]
        .drop_duplicates('number_id')
        .reset_index(drop=True)
    )
    matched_numbers = results.loc[results['draw_date'] == draw_date, 'number_id'].unique()

    # NOTE(review): the cutoff time equals the draw date that defines the label;
    # confirm that feature computation at this cutoff excludes that draw's rows.
    labelled = seen.assign(
        label=seen['number_id'].isin(matched_numbers).astype(int),
        cutoff_time=draw_date,
    )
    return labelled[['number_id', 'cutoff_time', 'lucky_no', 'label']]

In [None]:
# Testing: build the labels for a single draw date.
# pd.Timestamp is used because the pd.datetime alias was deprecated in
# pandas 1.0 and removed in 2.0.
df_draw = make_cutoffs(pd.Timestamp(2014, 1, 4))
print(df_draw.loc[df_draw['label'] == 1].count())

# Bar chart of how many numbers carry each label value
plt.figure(figsize=(8, 8))
df_draw['label'].value_counts().plot.bar();
# Trailing semicolon keeps the Text repr out of the cell output
plt.title('Label distribution');

In [None]:
# Rows of the draw dated 4 January of START_YEAR, ordered by lucky_no.
# pd.Timestamp replaces pd.datetime, which was removed in pandas 2.0.
df.loc[df['draw_date'] == pd.Timestamp(START_YEAR, 1, 4)].sort_values(['lucky_no'])

In [None]:

# Disabled label-generation code, kept for reference: it calls make_cutoffs for
# every draw date from START_YEAR onwards, concatenates the results, writes them
# to labels.csv, archives the file and downloads it through google.colab.
# labels = pd.DataFrame()
# for dt in df.loc[df['draw_date'].dt.year >= START_YEAR]['draw_date'].unique():
#   df_draw = make_cutoffs(dt)
#   if labels.empty:
#     labels = df_draw
#   else:
#     labels = pd.concat([labels, df_draw], axis=0)
#   #print(dt, labels.shape, len(labels.loc[labels['label'] == 1]))


# print(labels.shape, len(labels.loc[labels['label'] == 1]))
# labels.to_csv('labels.csv', index=False)

# !tar cvf labels.tar labels.csv 
# !gzip labels.tar

# from google.colab import files
# files.download('labels.tar.gz')

In [None]:
# Count the distinct lucky numbers drawn in START_YEAR or earlier
df.loc[df['draw_date'].dt.year.le(START_YEAR), 'lucky_no'].nunique()

In [None]:
# Load the label table from CSV; cutoff_time is parsed as a datetime
labels = pd.read_csv('../input/labels/labels.csv', header=0, sep=',', quotechar='"', parse_dates=['cutoff_time'])
print(labels.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
labels.info()

In [None]:
# Labels whose cutoff falls in the year after START_YEAR (at most 10000 rows)
labels[labels['cutoff_time'].dt.year.eq(START_YEAR + 1)].head(10000)
# labels[labels.isnull().any(axis=1)].head()

### Generate new features

In [None]:
# Preview the first ten rows of the table that feeds the entity set below
df.head(10)

In [None]:
es = ft.EntitySet(id="Results")

# Add the entire data table as an entity.
# "results_index" is not one of df's columns, so make_index=True states
# explicitly that featuretools should create it as a new index column instead
# of relying on the implicit fallback for a missing index.
es.entity_from_dataframe("results",
                         dataframe=df,
                         index="results_index",
                         make_index=True,
                         time_index='draw_date')

es['results']

In [None]:
# Create a new entity "numbers" from the "results" entity, indexed by
# number_id; lucky_no is passed as an additional variable for the new entity.
es.normalize_entity(new_entity_id="numbers",
                    base_entity_id="results",
                    index="number_id",
                    additional_variables=["lucky_no"])
# Preview the dataframe behind the new entity
es['numbers'].df.head()

In [None]:
# Size of the "numbers" entity, then the rows whose lucky_no equals 19
print(es['numbers'].df.shape)
print(es['numbers'].df.query('lucky_no == 19').head())

In [None]:
# Rows of the "results" entity that belong to number_id 19
es['results'].df.query('number_id == 19').head()

In [None]:
# Display the entity set summary
es

#### Generate features

In [None]:
# Make sure number_id is an integer column
labels['number_id'] = labels['number_id'].astype(int)
# Summary statistics of the labels for one cutoff date.
# pd.Timestamp replaces pd.datetime, which was removed in pandas 2.0.
labels.loc[labels['cutoff_time'] == pd.Timestamp(2014, 1, 22)].describe()

In [None]:
# Run featuretools' dfs on the "numbers" entity with a maximum depth of 2,
# using rows of `labels` as the cutoff times and all available cores (n_jobs=-1).
# NOTE(review): only the rows of `labels` from position 3569594 onward are
# passed as cutoff times; the offset is not explained here — confirm what it
# is meant to select.
# NOTE(review): chunk_size is len(labels) (the full table), not the length of
# the slice actually passed — confirm this is intended.
feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='numbers',
                                       cutoff_time = labels[3569594:], verbose = 2,
                                       cutoff_time_in_index = True,
                                       chunk_size = len(labels), n_jobs = -1,
                                       max_depth = 2)

In [None]:
# All result rows whose draw took place during START_YEAR
df[df['draw_date'].dt.year.eq(START_YEAR)]

In [None]:
# Show the first 100 rows transposed, so each feature becomes a row
display(feature_matrix.head(100).T)

In [None]:
# Check the Python types of the two objects returned by ft.dfs
type(feature_matrix), type(feature_names)

In [None]:
# Display the feature definitions returned by ft.dfs
feature_names

In [None]:
# Print the column dtypes, non-null counts and memory usage of the feature matrix
feature_matrix.info()

#### Correlations

In [None]:
# One-hot encode the feature matrix with pandas.get_dummies ...
feature_matrix = pd.get_dummies(feature_matrix)
# ... then move the index back into ordinary columns
feature_matrix = feature_matrix.reset_index()
feature_matrix.shape

In [None]:
# Pairwise correlations between the feature-matrix columns, with rows ordered
# by their correlation with the 'lucky_no' column (ascending).
# NOTE(review): the ranking uses 'lucky_no', not 'label' — confirm that
# lucky_no is the intended target here.
corrs = feature_matrix.corr().sort_values('lucky_no')
# Lowest correlations with lucky_no
corrs['lucky_no'].head()

In [None]:
# The 20 highest correlations with lucky_no, after dropping NaN entries
corrs['lucky_no'].dropna().tail(20)

In [None]:
# Keep the rows whose MAX(results.1st_digit) lies strictly between 0 and 10
digit_max = feature_matrix['MAX(results.1st_digit)']
plot_data = feature_matrix[(digit_max > 0) & (digit_max < 10)]

# One KDE curve per label value. `height` is used because FacetGrid's `size`
# argument was renamed to `height` in seaborn 0.9 and later removed.
g = sns.FacetGrid(plot_data, hue = 'label', height = 4, aspect = 3)
g.map(sns.kdeplot, 'MAX(results.1st_digit)')
g.add_legend();
plt.title('Distribution of Digits Total by Label');

In [None]:
# Write the feature matrix to feature_matrix.csv without the row index
feature_matrix.to_csv("feature_matrix.csv", index=False)

In [None]:
# Read the exported file back in and preview its first ten rows
myfeature = pd.read_csv("feature_matrix.csv")
myfeature.head(10)