# YAKC - Clean

## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# Plot style: matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
# and later releases dropped the old names, so use whichever one this install provides.
_darkgrid_style = "seaborn-v0_8-darkgrid"
if _darkgrid_style not in plt.style.available:
    _darkgrid_style = "seaborn-darkgrid"
plt.style.use(_darkgrid_style)

# Show every column when displaying wide frames.
pd.set_option('display.max_columns', None)

DEBUG = False
SEED = 666

import os

# Working directories: raw inputs (orig), cached pickles (data) and results (output).
for d in ['orig', 'data', 'output']:
    os.makedirs(d, exist_ok=True)


## Datasets

#### Authors

In [3]:
basename = "authors"

# Use the cached pickle in data/ when it exists; otherwise build it from the csv in orig/.
if not os.path.isfile(f"data/{basename}.pickle"):
    # no cache yet: read the compressed csv
    print("Reading csv", end=" ... ")
    df_authors = pd.read_csv(f"orig/{basename}.csv.gz")
    print(df_authors.shape, end=" ... ")
    # rename the QAuthor column to author
    df_authors = df_authors.rename(columns={"QAuthor": "author"})
    # keep a pickle so later runs skip the csv
    print("generating pickle file", end=" ... ")
    df_authors.to_pickle(f"data/{basename}.pickle")
else:
    # cache hit
    print("Reading pickle file", end=" ... ")
    df_authors = pd.read_pickle(f"data/{basename}.pickle")

print(df_authors.shape)
df_authors.head(5)

Reading csv ... (13182, 3) ... generating pickle file ... (13182, 3)


Unnamed: 0,author,isUQ,status
0,aa65b7dd5d5fa660d058e094669f884bf7d52299,0,Active
1,2b1505f289338751829dfa129c0b52d145c9eceb,1,Active
2,4eeddb9abeb3c4889f1b037016bf2aeb834bb66d,0,Active
3,08a6fae5a56fcdb495b2de8a02625ea2b4abe32f,1,Active
4,01301652214982c57efe894efe7e2c7d57df2801,0,Active


#### Comments

In [4]:
basename = "comments"

# Use the cached pickle in data/ when it exists; otherwise build it from the csv in orig/.
if not os.path.isfile(f"data/{basename}.pickle"):
    # no cache yet: read every column as str, except date_created which is parsed as a date
    print("Reading csv", end=" ... ")
    df_comments = pd.read_csv(
        f"orig/{basename}.csv.gz",
        dtype=str,
        parse_dates=["date_created"],
    )
    # drop every row that has a missing value in any column
    print(df_comments.shape, "dropna", end=" ... ")
    df_comments = df_comments.dropna()
    print(df_comments.shape, end=" ... ")
    # keep a pickle so later runs skip the csv
    print("generating pickle file", end=" ... ")
    df_comments.to_pickle(f"data/{basename}.pickle")
else:
    # cache hit
    print("Reading pickle file", end=" ... ")
    df_comments = pd.read_pickle(f"data/{basename}.pickle")

print(df_comments.shape)
df_comments.head(5)

Reading csv ... (11394929, 7) dropna ... (10831841, 7) ... generating pickle file ... (10831841, 7)


Unnamed: 0,id,link_id,parent_id,author,subreddit,body,date_created
0,e0mztbn,t3_8qy7gp,t3_8qy7gp,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,My account is new because i lost my password t...,2018-06-14 02:17:37
1,e0n0e9q,t3_8qy9wy,t3_8qy9wy,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,new account only because i lost the password t...,2018-06-14 02:28:21
2,e0n11j4,t3_8qy9wy,t3_8qy9wy,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,i appreciate all the comments i'll read thru t...,2018-06-14 02:40:09
3,e0n1q4v,t3_8qy9wy,t1_e0n0vfm,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,why would rosenstein threaten those asking (li...,2018-06-14 02:52:41
4,e0n1u7v,t3_8qy9wy,t1_e0n0ltk,182c774799aac38a84f5117fc59cde99b0df19af,greatawakening,interesting i'm reading now. i'm still confuse...,2018-06-14 02:54:46


#### Submissions

In [2]:
basename = "submissions"

# Load the submissions into their own frame. This cell used to assign to
# `df_comments`, which silently replaced the comments loaded earlier.
if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_submissions = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_submissions = pd.read_csv(f"orig/{basename}.csv.gz", dtype=str, parse_dates=["date_created"])
    # cleaning
    print(df_submissions.shape, "dropna", end=" ... ")

    # A missing `text` must not cause a row to be dropped, so only the
    # remaining columns are checked for NA.
    subset_labels = df_submissions.columns.drop("text").tolist()
    df_submissions = df_submissions.dropna(subset=subset_labels)
    print(df_submissions.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_submissions.to_pickle(f"data/{basename}.pickle")

print(df_submissions.shape)
df_submissions.head(5)

Reading csv ... (2775263, 13) dropna ... (2099686, 13) ... generating pickle file ... (2099686, 13)


Unnamed: 0,subreddit,id,score,numReplies,author,title,text,is_self,domain,url,permalink,upvote_ratio,date_created
0,greatawakening,8xuv4i,1,14,879f283b831c13474e219e88663d95b0763cca9b,I’ve been writing “Trump Lives Here” on my $20...,,False,i.redd.it,https://i.redd.it/h3mbbxvxq7911.jpg,/r/greatawakening/comments/8xuv4i/ive_been_wri...,-1.0,2018-07-11 00:27:24
1,greatawakening,8ydw3e,1,13,879f283b831c13474e219e88663d95b0763cca9b,Trying to take him seriously but...,,False,i.redd.it,https://i.redd.it/62gaw0th4l911.jpg,/r/greatawakening/comments/8ydw3e/trying_to_ta...,-1.0,2018-07-12 21:26:32
2,greatawakening,8ytwg0,1,0,879f283b831c13474e219e88663d95b0763cca9b,“It is all happening!” Crumb?,,False,i.redd.it,https://i.redd.it/yo9zscb1jx911.jpg,/r/greatawakening/comments/8ytwg0/it_is_all_ha...,-1.0,2018-07-14 15:09:25
3,greatawakening,8ytx4z,1,114,879f283b831c13474e219e88663d95b0763cca9b,“It is all happening!” Positive sign hopefully...,,False,i.redd.it,https://i.redd.it/v5c4zxcjjx911.jpg,/r/greatawakening/comments/8ytx4z/it_is_all_ha...,-1.0,2018-07-14 15:12:14
4,greatawakening,8yvgwt,1,23,879f283b831c13474e219e88663d95b0763cca9b,Pedogate is REAL! Happening here in my beloved...,,False,foxnews.com,http://www.foxnews.com/us/2018/07/14/texas-wom...,/r/greatawakening/comments/8yvgwt/pedogate_is_...,-1.0,2018-07-14 18:46:35


#### Subreddits

In [4]:
basename = "subreddits"

# Load the subreddit table into its own frame. This cell used to assign to
# `df_comments`, which silently replaced the frame loaded under that name.
if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_subreddits = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading
    print("Reading csv", end=" ... ")
    df_subreddits = pd.read_csv(f"orig/{basename}.csv.gz", dtype=str)
    # cleaning
    print(df_subreddits.shape, "dropna", end=" ... ")

    # Drop every row that has a missing value in any column (no column is exempt here).
    df_subreddits = df_subreddits.dropna()
    print(df_subreddits.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_subreddits.to_pickle(f"data/{basename}.pickle")

print(df_subreddits.shape)
df_subreddits.head(5)

Reading csv ... (12987, 38) dropna ... (12987, 38) ... generating pickle file ... (12987, 38)


Unnamed: 0,subreddit,numSubscribers,status,allModNames,allMods,qModNames,qMods,top_qModNames,top_qMods,firstPostSubmission,lastPostSubmission,firstPostComment,lastPostComment,qModsRatio,top_qModsRatio,activePreBanOnly,activePreQ,activePostBan,qAuth,top_qAuth,qSubmissions,top_qSubmissions,nonTop_qSubmissions,qComments,top_qComments,nonTop_qComments,top_qPercent,qPercent,Monthly Average Total Authors,Monthly Average Total Submissions,Monthly Average UQ Authors,Monthly Average UQ Submissions,Monthly Average QAnon Authors,Monthly Average QAnon Submissions,% UQ Submissions,% UQ Authors,% QAnon Submissions,% QAnon Authors
0,Watches,1525243.0,public,"['f1c355408b78fd88ebc13aade4c9a7924005c2ab','e...",13,[],0.0,[],0.0,2016-12-07 03:21:16,2020-11-27 19:24:49,2016-11-26 13:24:54,2021-01-22 18:22:43,0.0,0.0,0,1,1,58,10,219,99,120,1681.0,244.0,1437.0,0.29,0.44,2881.3846153846152,5911.846153846154,1.2,2.4,3.75,6.0,0.0405964556171441,0.0416466442415505,0.1014911390428604,0.1301457632548454
1,MMA,1518451.0,public,"['69e403df92bb49af60d5046c0be60f1a46bfd53d','d...",20,[],0.0,[],0.0,2016-11-01 05:51:55,2021-01-03 00:43:44,2016-10-28 00:39:15,2021-01-23 08:16:57,0.0,0.0,0,1,1,97,29,371,99,272,18910.0,2808.0,16102.0,0.83,0.74,1840.4615384615383,4953.461538461538,2.9166666666666665,4.75,6.384615384615385,9.923076923076923,0.0958925382405466,0.1584747415642676,0.2003261122757978,0.3469029507648583
2,Seattle,281450.0,public,"['14a0b21d55a0415e5541be3389d27ee3b3232c90','f...",7,[],0.0,[],0.0,2016-11-07 07:12:42,2021-01-18 23:13:14,2016-11-03 16:45:54,2021-01-21 14:56:00,0.0,0.0,0,1,1,53,13,188,99,89,1559.0,214.0,1345.0,0.37,0.4,732.7692307692307,1140.6923076923076,1.2,1.2,3.272727272727273,3.8181818181818175,0.1051992716973498,0.1637623346630275,0.3347249554006584,0.4466245490809841
3,UnusAnnus,195890.0,quarantined/private,['f7fd9c68f804acda665d2ab082217bb1583318f2'],1,[],0.0,[],0.0,2020-02-05 05:04:36,2020-11-14 05:24:35,2020-06-17 23:26:32,2020-11-13 09:04:06,0.0,0.0,0,0,1,1,1,99,99,0,3.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GlobalOffensiveTrade,209007.0,public,"['f78b47357eabba6f7580e47da7560a322287eaee','9...",22,[],0.0,[],0.0,2016-12-02 06:33:48,2020-08-30 21:38:25,2017-04-05 12:44:22,2019-12-19 01:06:07,0.0,0.0,0,1,1,6,1,122,99,23,86.0,46.0,40.0,0.03,0.05,2132.923076923077,17479.30769230769,1.0,19.0,1.0,19.0,0.1086999573121625,0.0468840161569532,0.1086999573121625,0.0468840161569532


#### Paper

In [3]:
basename = "paper"

# Load the paper table into its own frame. This cell used to assign to
# `df_comments`, which silently replaced the frame loaded under that name.
if os.path.isfile(f"data/{basename}.pickle"):
    # loading
    print("Reading pickle file", end=" ... ")
    df_paper = pd.read_pickle(f"data/{basename}.pickle")
else:
    # loading (unlike the other inputs, this csv is read uncompressed)
    print("Reading csv", end=" ... ")
    df_paper = pd.read_csv(f"orig/{basename}.csv", dtype=str)
    # cleaning
    print(df_paper.shape, "dropna", end=" ... ")

    # Drop every row that has a missing value in any column (no column is exempt here).
    df_paper = df_paper.dropna()
    print(df_paper.shape, end=" ... ")
    # save as pickle for later use
    print("generating pickle file", end=" ... ")
    df_paper.to_pickle(f"data/{basename}.pickle")

print(df_paper.shape)
df_paper.head(5)

Reading csv ... (13182, 3) ... generating pickle file ... (13182, 3)


Unnamed: 0,author,isUQ,status
0,aa65b7dd5d5fa660d058e094669f884bf7d52299,0,Active
1,2b1505f289338751829dfa129c0b52d145c9eceb,1,Active
2,4eeddb9abeb3c4889f1b037016bf2aeb834bb66d,0,Active
3,08a6fae5a56fcdb495b2de8a02625ea2b4abe32f,1,Active
4,01301652214982c57efe894efe7e2c7d57df2801,0,Active


In [8]:
# NOTE(review): no visible cell defines df_train, so this cell raised NameError on
# Restart & Run All. Loading it here mirrors how df_test is read from orig/test.csv;
# the train path is an assumption — confirm against the original data layout.
df_train = pd.read_csv("orig/train.csv")

# How many columns there are of each dtype.
df_train.dtypes.value_counts()

float64    178
int64       29
dtype: int64

In [13]:
# Total count of missing cells in df_train: per-column NA counts, then summed.
df_train.isnull().sum(axis=0).sum()

6038

#### Test

In [5]:
# Read the test csv from orig/, report its (rows, columns) and preview the frame.
df_test = pd.read_csv(filepath_or_buffer="orig/test.csv")
print(df_test.shape)
df_test

(6265, 206)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,x164,x165,x166,x167,x168,x169,x170,x171,x172,x173,x174,x175,x176,x177,x178,x179,x180,x181,x182,x183,x184,x185,x186,x187,x188,x189,x190,x191,x192,x193,x194,x195,x196,x197,x198,x199,x200,x201,x202,x203,x204,x205
0,3,4.2241,1,0.0000,23.5068,29.6787,0.450,1.980838,0.163172,16295.85,0.00,5,1.0945,0.0533,0.1333,0.430486,0.000064,-0.552766,15.8210,30.8575,0.000,0.000000,3518.1899,0.000000,0.0000,0.000,0.0000,-2.669028,0.724260,11.5360,0.0000,1.363201,0.0000,2,0.00,0.4409,0.000,0.0,4,0.0000,0.0000,0.0000,0.00,0.292289,0.0000,0.0000,0.0000,90.000,990.09,0.0000,0.027496,0.0,0.0000,1,0.000,2,1.1065,0.72,-0.383887,3,0.000000,60.0000,-0.999931,0.0676,2.405041,67.0820,1.0,5,0.0,,0.0133,522.5555,,0.084792,0.0000,0.252765,0.0,0.1747,0.0000,0.00,0.0000,2.340,-4.421815,0.54,0.0000,0.0000,0.000000,44.1145,0.00,0.257967,0.0000,0.0711,1.1169,0.0000,0.0424,2.5000,0.000000,3671.37,-1.765610,1.44,1.1022,3,2,1,0.0000,0.0000,216.0818,1,0.0000,0.0,0.0,0.0,618.66,0.0,0.000216,2.4,0.0000,0.0000,10.9480,1,0.00,0.0000,3,-0.101281,0.0000,5,4.4667,0.0000,5,-0.099349,0.0000,0.000000,0.0,1.1216,3,0.000,0.0000,0.0,0.00,0.00,18.1667,14.0200,0.0000,2,0.00,1.980838,-4.421815,0.0000,0,3707.0339,0.255978,1.562013,1.0958,-0.211940,0.0000,0.0000,0,2.405041,0.1786,1.1440,0.0000,0.00,0.0000,1907.82,0.0000,2.883239,0.0000,2,58.005,0.0356,-2.337341,1,0.0000,0.412650,-4.120612,0.0000,2021,0.000,227.9023,-0.383887,0.000000,2,0.0000,0,995,0.0,0.0,-2.854167,1.2489,5,1.875138,0.0000,4.86,1.518522,0.292289,0.0000,0.99,19.4360,1.0699,3.0,0.0000,1.0,0.00,0.000,0,0.000000
1,3,0.0000,1,1.3244,6.6164,0.0000,0.540,-2.606239,0.000000,0.00,0.00,7,0.0000,0.0160,0.0000,-0.835329,0.000024,-0.561637,6.5965,9702.2157,0.765,0.000000,0.0000,0.000000,0.0000,0.000,0.0000,-0.999451,0.000000,0.0000,0.4960,1.520167,0.0000,3,2.07,0.0499,0.000,0.0,3,0.0000,0.0044,0.0000,0.00,-0.457013,7.2692,1.1671,0.0000,60.000,1465.89,0.0000,0.003376,0.0,0.0000,1,0.000,2,0.0000,905.58,1.763969,3,0.000676,0.0000,-1.396045,0.0560,0.625527,0.0000,1.0,4,0.0,0.0,0.0044,0.0000,2.0,0.974472,0.0000,-0.245821,0.0,0.0000,0.0000,0.00,0.0000,0.000,-2.442345,0.00,0.0000,0.0000,0.000000,5.2542,0.00,-1.137698,0.0000,0.0000,1.1557,0.0000,0.0000,1.2000,0.021360,0.00,-1.965061,0.54,0.0044,5,3,1,2.2000,0.0587,0.0000,1,0.1022,0.0,0.0,0.0,75.96,0.0,0.000000,1.3,1.0831,0.3147,0.0000,1,0.00,0.0000,4,-1.083037,0.0000,5,0.0000,0.0000,3,0.018753,0.0000,0.000000,0.0,1.1347,3,2.070,0.0000,0.0,0.00,15.21,14.8267,0.0000,0.0000,2,480.60,-2.606239,-2.442345,0.0000,0,0.0000,-1.329380,0.826567,0.0000,0.175797,0.0000,0.0000,0,0.625527,0.0000,1.2473,379.4733,0.00,3.3791,21925.62,13.6387,0.838948,0.0000,2,0.000,0.0000,-0.176897,1,90.0000,0.784310,0.165931,0.0000,2015,0.540,0.0000,1.763969,0.000000,2,34.8912,0,1456,0.0,0.0,2.020684,0.0133,4,-2.273555,1.1382,0.00,3.998988,-0.457013,0.0000,75.96,0.8120,1.0472,3.0,0.0000,2.0,0.00,0.000,0,0.000092
2,3,6.7518,1,0.0089,31.1296,47.9627,0.000,1.907240,0.000028,16537.50,0.00,6,1.1297,0.0000,0.0044,-0.335846,0.000000,-0.323353,21.2705,71.2133,0.630,0.000000,3490.4073,0.000000,0.0000,0.000,0.0000,0.923170,0.735000,0.0267,1.1227,2.840879,0.0000,3,0.00,0.4035,0.000,1.0,3,0.0000,0.0000,0.0000,0.00,6.346902,10.5500,0.0000,0.0000,67.082,1552.56,0.0000,0.026616,0.0,0.0000,1,0.000,2,1.0945,0.81,6.631989,3,0.001524,60.0000,-2.126982,0.0658,-0.261761,60.0000,0.0,5,0.0,0.0,0.0000,295.4657,2.0,0.235780,0.0000,0.657043,0.0,0.6360,0.0000,0.00,0.0000,1.980,-0.166256,0.63,0.0000,0.0000,0.000000,42.7866,0.00,-0.974155,0.0000,0.0933,1.1248,0.0000,0.0000,0.0000,0.000056,0.63,7.205096,0.00,0.9067,4,3,1,0.0000,0.0000,0.0000,1,0.0622,0.0,0.0,0.0,598.86,0.0,0.000996,0.8,1.1773,2.7077,1.6667,1,0.00,0.0000,3,0.522825,0.0000,5,7.4688,0.0000,5,-0.658656,0.0000,0.000000,0.0,1.1347,3,0.000,0.0000,0.0,0.00,34.29,39.6613,21.8030,0.0000,2,1.26,1.907240,-0.166256,0.0000,0,0.0000,0.065581,-5.574248,1.1455,3.147528,0.0000,0.0000,0,-0.261761,0.5913,1.1610,202.0820,0.00,0.1800,5305.05,0.0453,2.258411,0.0000,2,0.630,0.1111,2.465039,1,10296.3392,-0.496219,0.949382,0.0000,2020,1.395,0.0000,6.631989,0.000000,2,2.1250,0,1548,0.0,0.0,-2.768246,1.8667,5,0.297217,1.1023,22.41,0.599163,6.346902,0.0000,1.17,18.6547,0.0000,3.0,0.0000,2.0,0.00,0.000,0,0.000000
3,3,87.7282,1,1.5689,27.8614,5.8093,0.765,-2.373402,0.006536,1470.42,0.00,6,1.1609,1.3280,0.0356,-0.684918,0.002912,-0.429417,27.0710,711.0611,0.720,0.000000,335.8771,0.000000,0.0000,0.000,0.0000,-1.187768,0.065352,2.3973,1.1280,-1.602373,0.0000,2,62.28,0.3820,0.000,1.0,3,0.0000,0.0933,0.0000,0.00,-0.880441,10.0714,1.1663,0.0000,60.000,1445.68,0.0000,0.027008,0.0,0.0000,1,0.000,2,0.0000,0.72,-0.379069,3,0.001752,120.0000,-0.163890,0.0735,2.427688,67.0820,0.0,4,0.0,0.0,0.1689,0.0000,2.0,0.773664,0.0000,0.429807,0.0,0.0000,0.0000,0.00,0.0000,23.310,1.845067,0.00,0.0000,0.0000,0.000000,44.0485,0.00,-0.992454,0.0000,0.0489,1.1268,0.0000,4.1570,9.2593,0.120008,147.06,-4.599989,65.52,0.1200,3,3,1,10.6415,1.4987,23.8255,1,0.1289,0.0,0.0,0.0,607.68,0.0,0.000000,0.6,1.1355,1.6655,11.1358,1,0.00,1.9811,3,1.040476,0.0000,5,0.0000,0.0000,4,1.026824,0.0000,0.000000,0.0,1.1383,3,2.250,0.0000,0.0,0.00,39.42,63.8640,8.9648,0.0000,1,2700.18,-2.373402,1.845067,0.0000,0,372.2724,-0.416422,3.681627,1.1905,-0.199133,0.0000,0.0000,0,2.427688,0.0000,1.2178,216.3331,0.00,51.1180,17407.44,43.7080,-1.485136,0.0000,2,3.195,0.0000,-2.860613,1,67.0820,1.717978,-1.129010,0.0000,2021,0.810,60.0000,-0.379069,0.000000,2,48.0461,0,1447,0.0,0.0,0.788583,1.2178,5,-0.472918,1.1383,0.00,0.317534,-0.880441,0.0000,1.80,19.3120,1.0923,3.0,240.0000,2.0,0.00,0.000,0,0.002768
4,2,97.0239,1,4.0756,47.6245,12.0040,0.720,-1.940630,0.000000,2571.48,0.72,5,1.1263,0.3693,0.0000,0.497241,0.000620,0.737561,44.2645,482.5136,0.720,0.000172,115.1449,0.003436,2531.4028,0.000,1.0902,3.046115,0.114288,0.0000,1.2773,-6.039875,0.0000,2,325.35,0.5197,0.810,1.0,2,0.0222,0.3022,167.7267,2.25,-2.739155,8.7091,1.1473,1.4472,60.000,1375.44,0.1253,0.038564,0.0,0.2173,1,0.000,2,0.0000,0.99,3.229866,2,0.003004,67.0820,3.266722,0.0664,0.432787,67.0820,0.0,4,0.0,0.0,0.0578,0.0000,,0.662764,0.0000,-0.164857,0.0,0.0000,1.1673,3.87,4.3541,23.310,7.379442,0.00,0.0044,12.3220,0.000000,43.8629,12.51,-2.375249,0.0222,0.1644,1.1347,1.0909,0.7509,5.7200,0.162108,0.00,-5.330209,13.95,0.2089,3,3,1,19.0744,6.1440,0.0000,1,0.1156,0.0,0.0,0.0,867.69,0.0,0.000000,3.3,1.0973,5.0370,0.0000,1,77.31,4.6525,3,0.268656,1.8333,5,0.0000,3.3571,3,-0.686159,0.0293,0.000028,0.0,1.1383,3,3.240,1.1281,0.0,0.63,67.59,101.6040,13.5221,1.9347,1,3647.43,-1.940630,7.379442,3035.3418,0,658.6350,-0.045303,1.402204,0.0000,-1.367509,0.0000,3.4583,0,0.432787,0.0000,1.1839,408.0441,0.63,47.2410,14912.19,84.0573,-4.168653,0.0000,2,0.000,0.0000,-0.425156,1,67.0820,-1.890956,0.428630,0.1867,2016,1.215,0.0000,3.229866,0.000556,2,78.6402,0,1371,0.0,0.0,2.342365,1.4844,4,0.965675,1.1383,0.00,-0.960147,-2.739155,0.2319,1.53,22.9813,1.0699,3.0,123.6932,2.0,0.00,0.000,0,0.014460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6260,5,64.4097,3,0.0000,24.3589,3.0053,0.720,-2.234310,0.083360,236.16,0.63,4,1.2009,2.9413,0.3600,1.261977,0.005200,2.442722,26.4205,990.6015,0.000,0.000000,19.9226,0.002856,67.0820,0.585,0.0000,5.816715,0.010496,16.5640,0.0160,-5.622266,1.0589,1,50.76,0.3346,0.000,0.0,6,0.8222,0.2711,210.0000,0.81,-2.482120,1.2000,1.1347,6.0270,60.000,352.70,0.0000,0.056148,0.0,11.5933,1,1.625,1,1.1082,0.00,4.640878,6,0.000024,140.7775,0.265302,0.0715,-2.393854,60.0000,0.0,6,0.0,1.0,0.5600,67.0820,1.0,0.809056,0.0347,-0.105289,0.0,4.6173,1.0921,0.00,9.3527,14.805,5.553127,0.81,0.0311,7.9630,0.000052,59.4895,481.77,0.387232,0.0000,0.0444,1.0803,1.1355,0.5753,15.2055,0.000000,1875.60,-0.858579,117.00,0.5111,2,2,3,13.9167,1.7787,37.7003,3,0.0044,0.0,0.0,0.0,1263.33,60.0,0.008808,4.7,1.0472,0.0000,21.7336,1,64.26,0.6040,2,-1.093845,4.2105,6,18.5426,0.0000,6,0.537753,0.2133,0.000332,60.0,1.1100,2,0.630,1.1065,0.0,7.47,0.54,56.2320,10.9417,1.1453,5,0.00,-2.234310,5.553127,0.0000,0,456.9464,-0.712108,1.206151,1.1375,1.465632,0.3483,29.6667,0,-2.393854,3.9144,1.1347,0.0000,1.08,0.0000,18203.76,0.0000,-0.265892,0.0089,6,10.800,0.4178,-4.373753,1,0.0000,0.958902,-0.150691,0.1022,2021,0.540,67.0820,4.640878,0.021412,5,0.0000,1,365,0.0,0.0,4.993787,0.6844,6,0.162529,0.0000,198.18,-1.841729,-2.482120,0.0000,0.81,37.4347,1.0516,2.0,182.4829,1.0,1.17,0.135,0,0.002256
6261,2,0.0000,1,0.7556,4.9334,0.0000,7.740,1.236457,0.000000,0.00,0.00,6,0.0000,0.1680,0.0000,0.561012,0.000688,0.149861,4.9925,7209.7789,0.720,0.000000,0.0000,0.000000,0.0000,0.000,0.0000,2.112211,0.000000,0.0000,0.6520,-0.076278,0.0000,3,0.00,0.0374,0.000,1.0,2,0.0000,0.0000,0.0000,0.00,-1.795855,7.3235,0.0000,0.0000,60.000,1421.48,0.0000,0.006388,0.0,0.0000,1,0.000,2,0.0000,0.81,-1.292921,2,0.001156,0.0000,2.625066,0.0637,1.808144,0.0000,0.0,4,0.0,0.0,0.0089,0.0000,2.0,0.981024,0.0000,-0.274839,0.0,0.0000,0.0000,0.00,0.0000,0.000,5.083203,0.00,0.0000,0.0000,0.000000,12.1125,0.00,0.466383,0.0000,0.0000,1.0984,0.0000,1.8900,2.3333,0.010744,0.00,-1.633875,15.48,0.0044,4,3,1,0.0000,0.0000,0.0000,1,0.0844,0.0,0.0,0.0,143.73,0.0,0.000000,1.3,1.0831,1.5734,0.0000,1,0.00,0.0000,3,0.426512,0.0000,4,0.0000,0.0000,3,-0.488607,0.0000,0.000000,0.0,1.1328,3,0.000,0.0000,0.0,0.00,26.01,10.4480,0.0000,0.0000,1,241.74,1.236457,5.083203,0.0000,0,3134.5335,0.116552,-1.691418,0.0000,0.492671,0.0000,0.0000,0,1.808144,0.0000,1.3520,569.2100,0.00,1.8211,22073.04,7.4493,-1.405560,0.0000,2,0.000,0.0000,8.322939,1,84.8528,1.089389,2.448507,0.0000,2020,0.900,0.0000,-1.292921,0.000000,2,27.0673,0,1403,0.0,0.0,-2.688401,0.0267,4,0.134865,1.1395,0.00,-0.633933,-1.795855,0.0000,143.73,2.5760,1.0858,3.0,0.0000,2.0,0.00,0.000,0,0.000000
6262,3,5.6391,1,0.8356,4.5338,0.4613,0.900,3.210738,0.001504,76.41,1.62,6,1.1010,0.0187,0.0044,0.476302,0.000040,-0.580793,4.6090,10326.9166,0.630,0.000000,7.6050,0.000144,591.6925,0.000,0.0000,1.645344,0.003396,0.1760,0.0000,-1.352946,0.0000,3,71.91,0.0373,0.000,1.0,3,0.0267,0.1911,120.0000,0.63,-4.256409,0.0000,1.1289,0.1855,60.000,1408.54,0.0000,0.000868,0.0,0.1467,1,0.000,2,0.0000,110.79,-3.812181,3,0.000000,1056.8349,-0.659100,0.0530,2.537036,94.8683,0.0,3,0.0,0.0,0.0044,0.0000,,0.981128,0.0000,1.075724,0.0,0.0000,0.0000,0.00,0.5400,38.205,-0.676031,0.00,0.0000,3.0000,0.000000,5.4333,4.05,-0.736903,0.0000,0.0089,1.0600,1.1093,0.0000,1.0000,0.009544,33.84,-10.149250,0.90,0.0222,4,3,1,12.5088,1.9013,0.0000,1,0.0000,0.0,0.0,0.0,19.53,0.0,0.000000,0.6,0.0000,0.0000,1.6923,1,3.24,2.8463,3,0.553616,0.0000,4,0.0000,0.0000,5,-0.387347,0.0000,0.000000,0.0,1.1141,3,0.810,1.1436,0.0,0.00,0.00,9.3547,2.9322,0.0947,1,214.74,3.210738,-0.676031,0.0000,0,0.0000,0.457135,1.963269,1.0835,-0.688746,0.0000,3.9286,0,2.537036,0.0000,1.1267,0.0000,0.00,3.6154,22075.38,6.6747,4.492799,0.0000,2,33.840,0.0000,5.103708,1,152.9706,-0.492499,0.677394,0.0089,2021,0.000,0.0000,-3.812181,0.000180,2,25.6327,0,1412,0.0,0.0,1.734925,0.0133,5,-0.119728,1.1141,0.00,2.699156,-4.256409,0.0000,1.17,0.4200,1.0223,3.0,108.1665,2.0,0.00,0.000,0,0.003196
6263,3,0.0000,1,2.6889,19.6045,0.0000,0.000,-5.095745,0.000000,0.00,0.00,5,0.0000,0.0000,0.0000,0.603294,0.000000,-0.139157,19.7540,2153.3850,0.720,0.004212,0.0000,0.000000,247.3863,0.000,1.1403,-3.782023,0.000000,0.0000,0.1600,0.613642,0.0000,2,46.08,0.1994,0.720,0.0,4,0.0267,0.0711,0.0000,2.43,-2.106944,4.0000,1.1639,1.8484,60.000,1546.90,2.7707,0.001344,0.0,0.3520,1,0.000,2,0.0000,0.81,-1.752915,3,0.000208,0.0000,-1.292361,0.0623,-0.200127,0.0000,1.0,5,0.0,0.0,0.0000,0.0000,2.0,0.888848,0.0000,-1.138674,0.0,0.0000,0.0000,94.77,0.0000,0.000,3.337798,0.00,0.0000,0.0000,0.000000,5.2162,15.75,-0.033797,0.2844,0.0000,1.1138,1.1312,0.0000,0.0000,0.102640,0.00,4.492470,0.00,0.0044,3,2,1,9.2174,1.1293,0.0000,1,0.0400,0.0,0.0,0.0,30.24,0.0,0.000000,0.4,1.0699,0.0927,0.0000,1,0.00,1.3288,3,1.127725,0.0000,5,0.0000,16.0462,5,-1.539538,0.0000,0.000000,0.0,1.1383,3,2.610,0.0000,0.0,0.00,4.68,46.7973,0.0000,0.0000,2,2309.40,-5.095745,3.337798,114.0833,0,0.0000,2.019932,-2.399245,0.0000,0.285714,0.0000,4.9259,0,-0.200127,0.0000,1.2632,1441.5617,0.00,43.5941,19999.08,48.3027,-5.586931,0.0000,2,0.000,0.0000,1.790167,1,67.0820,0.246319,5.818289,0.0000,2019,0.450,0.0000,-1.752915,0.000700,2,56.5981,0,1534,0.0,0.0,-3.153808,0.3733,5,0.337841,1.1415,0.00,0.736060,-2.106944,1.8361,30.24,0.5093,0.0000,3.0,396.1158,,0.00,0.000,0,0.002048


## Cleaning and reporting of numbers

In [14]:
# Persist both frames as pickles under data/ (train first, then test).
for name, frame in (("train", df_train), ("test", df_test)):
    frame.to_pickle(f"data/{name}.pkl")