In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from scipy import stats
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
train_user = reduce_mem_usage(pd.read_csv('../input/train/train_user.csv', usecols=['phone_no_m', 'label']))
test_user = reduce_mem_usage(pd.read_csv('../input/test/test_user.csv', usecols=['phone_no_m']))

train_sms = reduce_mem_usage(pd.read_csv('../input/train/train_sms.csv'))
test_sms = reduce_mem_usage(pd.read_csv('../input/test/test_sms.csv'))

Memory usage of dataframe is 97824.00 MB
Memory usage after optimization is: 55082.00 MB
Decreased by 43.7%
Memory usage of dataframe is 16488.00 MB
Memory usage after optimization is: 16488.00 MB
Decreased by 0.0%
Memory usage of dataframe is 219152416.00 MB
Memory usage after optimization is: 171212853.00 MB
Decreased by 21.9%
Memory usage of dataframe is 13084928.00 MB
Memory usage after optimization is: 10222628.00 MB
Decreased by 21.9%


In [4]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

20

In [5]:
# 只取最后一个月的数据
train_sms = train_sms[train_sms['request_datetime'] >= '2020-03-01 00:00:00']

In [6]:
train_sms.columns

Index(['phone_no_m', 'opposite_no_m', 'calltype_id', 'request_datetime'], dtype='object')

In [7]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

60