In [23]:
import os
import random
import gzip

# data
import pandas as pd
import numpy as np

# visualisation
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

# sklearn
import sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn import preprocessing
from sklearn.preprocessing import normalize, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold, LeavePOut, LeaveOneOut, \
    ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit, GroupKFold, \
    train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error, log_loss
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator


import warnings
warnings.filterwarnings("ignore")

#### Функции

In [26]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.iloc[0].values
    summary['Second Value'] = df.iloc[1].values
    summary['Third Value'] = df.iloc[2].values

    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2) 

    return summary

In [31]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

#### Раскладываем .csv файл по .pkl, параллельно уменьшая их размер

In [33]:
for i, chunk in enumerate(pd.read_csv(r'input/train.tar.gz', sep=';', 
                         quotechar='"', compression='gzip', chunksize=300000), start=1):
    print(i)
    reduce_mem_usage(chunk).to_pickle(fr'input/train_pickles/{i}.pkl')

1
Mem. usage decreased to 18.60 Mb (57.2% reduction)
2
Mem. usage decreased to 18.60 Mb (57.2% reduction)
3
Mem. usage decreased to 18.60 Mb (57.2% reduction)
4
Mem. usage decreased to 18.60 Mb (57.2% reduction)
5
Mem. usage decreased to 18.60 Mb (57.2% reduction)
6
Mem. usage decreased to 18.60 Mb (57.2% reduction)
7
Mem. usage decreased to 18.60 Mb (57.2% reduction)
8
Mem. usage decreased to 19.17 Mb (55.9% reduction)
9
Mem. usage decreased to 19.17 Mb (55.9% reduction)
10
Mem. usage decreased to 19.17 Mb (55.9% reduction)
11
Mem. usage decreased to 19.17 Mb (55.9% reduction)
12
Mem. usage decreased to 19.17 Mb (55.9% reduction)
13
Mem. usage decreased to 19.17 Mb (55.9% reduction)
14
Mem. usage decreased to 19.17 Mb (55.9% reduction)
15
Mem. usage decreased to 19.17 Mb (55.9% reduction)
16
Mem. usage decreased to 19.17 Mb (55.9% reduction)
17
Mem. usage decreased to 19.17 Mb (55.9% reduction)
18
Mem. usage decreased to 19.17 Mb (55.9% reduction)
19
Mem. usage decreased to 19.17 Mb (

#### Генерируем рандомный сэмпл из файлов

In [34]:
file_nums = np.random.randint(1, 100, 2)

train_df = pd.DataFrame()
for file_num in file_nums:
    train_df = pd.concat([train_df, pd.read_pickle(fr'input/train_pickles/{file_num}.pkl')], axis=0)

In [36]:
train_df.shape

(600000, 19)

In [37]:
resumetable(train_df)

Dataset Shape: (600000, 19)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,train.csv,int32,0,4126,1379318748,1379318748,1379318748,11.99
1,label,int8,0,2,0,0,0,0.02
2,C1,int64,0,93443,4142269800,3003633464,3303640266,15.3
3,C2,int64,0,30322,2277192293,1689103729,705668221,12.39
4,C3,int16,0,3925,2081,5800,496,9.11
5,C4,int32,0,14394,21050,6153,27063,11.23
6,C5,int8,0,88,35,95,30,4.3
7,C6,int16,0,1960,995,3905,995,1.63
8,C7,int8,0,3,0,0,2,1.01
9,C8,int16,0,282,468,147,452,5.15


In [38]:
train_df.label.value_counts()

0    598773
1      1227
Name: label, dtype: int64