# Modeling-oriented EDA

**About**: This notebook contains the EDA I had to do before starting to build models.
It is quite short but contains all the information I needed to get started.

No pointless plots, only stuff you'll need for modeling :)

In [4]:
import os
import json
import matplotlib
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold

# Default (width, height) for every figure created in this notebook.
matplotlib.rcParams.update({'figure.figsize': (12, 8)})

In [5]:
# Folder holding the raw CSV inputs. The trailing slash matters: file names are
# appended to this string directly when the tables are read.
DATA_PATH = "../raw_data/"

In [6]:
# Topics table, restricted right after loading to rows whose `has_content`
# flag compares equal to True.
topics = (
    pd.read_csv(f"{DATA_PATH}topics.csv")
    .loc[lambda frame: frame["has_content"] == True]  # noqa: E712 (element-wise comparison)
)

# The two remaining raw tables are loaded unfiltered.
content = pd.read_csv(f"{DATA_PATH}content.csv")
correlations = pd.read_csv(f"{DATA_PATH}correlations.csv")

## Split

In [4]:
# Every topic whose category is "source" goes straight into the training fold.
# The id/fold frame is built in a single expression (.loc + .assign), so the
# `fold` column is added to a fresh frame instead of being written into a
# chained slice of `topics`, which raises SettingWithCopyWarning on pandas
# versions without copy-on-write. Rows, columns and index are unchanged.
topics_train = topics.loc[topics["category"] == "source", ["id"]].assign(fold="train")

In [5]:
# Every topic that is not in the "source" category forms the pool that is
# split below; the index is reset so positional indices line up with .iloc.
topics_val = topics.loc[topics["category"] != "source"].reset_index(drop=True)

# Shuffled 4-way splitter with a fixed seed, stratified on language.
# NOTE(review): the groups passed below are the topic ids themselves; if ids
# are unique per row, grouping adds no constraint beyond the stratification.
# Confirm whether a coarser grouping key was intended.
sgkf = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=1773)

# Keep only the first (train_indices, test_indices) pair the splitter yields.
split_idxs = next(
    sgkf.split(
        X=topics_val["id"],
        y=topics_val["language"],
        groups=topics_val["id"],
    )
)

split_idxs

(array([    0,     2,     3, ..., 25000, 25001, 25002]),
 array([    1,     5,     9, ..., 24990, 24995, 24998]))

In [6]:
# Rows of the non-source pool picked by the first index array of `split_idxs`
# are added to the training fold. The `fold` column is attached with .assign
# so it lands on a fresh frame rather than on a column slice (the original
# slice-then-assign pattern raises SettingWithCopyWarning on pandas versions
# without copy-on-write). Result: columns id/fold with a 0..n-1 index.
topics_add_train = (
    topics_val.iloc[split_idxs[0]][["id"]]
    .reset_index(drop=True)
    .assign(fold="train")
)

In [7]:
# Append the extra training ids to the ids already in `topics_train`.
# This cell reassigns `topics_train` from itself; without the de-duplication,
# running it a second time would append `topics_add_train` again and silently
# duplicate rows. Dropping repeated ids (first occurrence kept) makes the cell
# give the same frame however often it is executed.
# NOTE(review): assumes every topic id should appear once in the split - confirm.
topics_train = (
    pd.concat([topics_train, topics_add_train], ignore_index=True)
    .drop_duplicates(subset="id", ignore_index=True)
)

In [8]:
# Rows of the non-source pool picked by the second index array of `split_idxs`
# become the held-out set, labelled "test". The `fold` column is attached with
# .assign so it lands on a fresh frame rather than on a column slice (the
# original slice-then-assign pattern raises SettingWithCopyWarning on pandas
# versions without copy-on-write). Result: columns id/fold with a 0..n-1 index.
topics_holdout = (
    topics_val.iloc[split_idxs[1]][["id"]]
    .reset_index(drop=True)
    .assign(fold="test")
)

In [9]:
# Language distribution of the topics whose id is in `topics_train`.
topics.loc[topics["id"].isin(topics_train["id"]), "language"].value_counts()

en     24045
es     10124
pt      3363
ar      3173
fr      2938
bg      2420
sw      2063
gu      1699
bn      1604
hi      1264
it       722
zh       672
mr       239
fil      163
as       112
my       110
km       104
kn        88
te        66
ur        54
or        51
ta        44
pnb       40
swa       33
pl        28
tr        26
ru        21
Name: language, dtype: int64

In [10]:
# Language distribution of the non-source topics that were added to training.
topics.loc[topics["id"].isin(topics_add_train["id"]), "language"].value_counts()

en     12153
es      4880
bn       384
gu       321
fr       281
hi       255
pt       197
fil      163
sw        71
as        47
Name: language, dtype: int64

In [11]:
# Language distribution of the held-out topics.
topics.loc[topics["id"].isin(topics_holdout["id"]), "language"].value_counts()

en     4008
es     1645
bn      127
gu      110
hi      109
fr       96
pt       62
fil      61
sw       19
as       14
Name: language, dtype: int64

In [12]:
# Stack the training rows on top of the held-out rows and renumber the index.
split_df = pd.concat(
    objs=[topics_train, topics_holdout],
    axis=0,
    ignore_index=True,
)

In [13]:
# Persist the split table next to the notebook, without the index column.
split_df.to_csv(path_or_buf='train_test_splits.csv', index=False)

Done !