In [1]:
import piplite
await piplite.install('seaborn')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# load dataset
dt=pd.read_csv("./budget.csv")
dt.shape

(264, 63)

In [6]:
is_null_sum=dt.isnull().sum()

In [7]:
num_var=dt.select_dtypes(include=['int64', 'float64']).columns
num_var_miss = [var for var in num_var if is_null_sum[var]>0]

In [8]:
num_var_miss

['1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018']

In [10]:
num_var_mean=['1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974']
num_var_median=['1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995']
num_var_mode=['1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004']
num_var_const=[
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018']

In [11]:
num_var_mean_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
num_var_median_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
num_var_mode_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent"))])
num_var_const_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value=0))])

In [13]:
preProcessor=ColumnTransformer(transformers=[("mean_imputer", num_var_mean_imputer, num_var_mean),("median_imputer", num_var_median_imputer, num_var_median), ("mode_imputer",num_var_mode_imputer, num_var_mode), ("const_imputer", num_var_const_imputer, num_var_const)])

In [14]:
preProcessor.fit(dt)

In [16]:
preProcessor.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([4.48237418e+09, 4.45064926e+09, 4.52029730e+09, 4.46243467e+09,
       4.52573018e+09, 4.42937952e+09, 5.09843570e+09, 5.75774357e+09,
       5.66145643e+09, 5.86619781e+09, 5.64163246e+09, 5.65538881e+09,
       6.13913722e+09, 6.49430221e+09, 7.61527220e+09])

In [17]:
preProcessor.named_transformers_["median_imputer"].named_steps["imputer"].statistics_

array([5.35015353e+08, 5.85254161e+08, 6.86893244e+08, 6.97560172e+08,
       8.16102993e+08, 9.19373639e+08, 9.75581463e+08, 9.89333333e+08,
       9.16175972e+08, 9.04177805e+08, 9.11349661e+08, 8.67538986e+08,
       9.25756539e+08, 9.99951023e+08, 1.41000485e+09, 1.38758054e+09,
       1.30028439e+09, 1.10235723e+09, 1.18103480e+09, 9.79531817e+08,
       1.17261298e+09])

In [18]:
preProcessor.named_transformers_["mode_imputer"].named_steps["imputer"].statistics_

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
preProcessor.named_transformers_["const_imputer"].named_steps["imputer"].statistics_

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
dt["1961"].mean()

4450649264.648875

In [23]:
dt_clean = preProcessor.transform(dt)

In [24]:
# remider by default is on 'drop' so it will drop all non missing value so. Hence to prevent this we will change it to 'passthrough'.

In [26]:
dt_clean_missing_now_filled=pd.DataFrame(dt_clean, columns=num_var_mean+num_var_median+num_var_mode+num_var_const)

In [28]:
dt_clean_missing_now_filled.isnull().sum().sum()

0

In [29]:
dt_clean_missing_now_filled.value_counts()

1960          1961          1962          1963          1964          1965          1966          1967          1968          1969          1970          1971          1972          1973          1974          1975          1976          1977          1978          1979          1980          1981          1982          1983          1984          1985          1986          1987          1988          1989          1990          1991          1992          1993          1994          1995          1996          1997          1998          1999          2000          2001          2002          2003          2004          2005          2006          2007          2008          2009          2010          2011          2012          2013          2014          2015          2016          2017          2018        
4.482374e+09  4.450649e+09  4.520297e+09  4.462435e+09  4.525730e+09  4.429380e+09  5.098436e+09  5.757744e+09  5.661456e+09  5.866198e+09  5.641632e+09  5.655389e+09  6.13913

In [31]:
dt_final=pd.merge(dt, dt_clean_missing_now_filled)
dt_final

Unnamed: 0,Name,Code,Type,Indicator Name,1960,1961,1962,1963,1964,1965,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Australia,AUS,Country,Military expenditure (current USD),4.597601e+08,4.709601e+08,4.894401e+08,5.532801e+08,6.557601e+08,7.873601e+08,...,1.896014e+10,2.321769e+10,2.659720e+10,2.621658e+10,2.482526e+10,2.578371e+10,2.404557e+10,2.638295e+10,2.769111e+10,2.671183e+10
1,Austria,AUT,Country,Military expenditure (current USD),9.155910e+07,9.102985e+07,1.000270e+08,1.259599e+08,1.645947e+08,1.428957e+08,...,3.334755e+09,3.218351e+09,3.409721e+09,3.187227e+09,3.229066e+09,3.305159e+09,2.665410e+09,2.885947e+09,3.138359e+09,3.367460e+09
2,Belgium,BEL,Country,Military expenditure (current USD),3.832202e+08,3.912188e+08,4.222208e+08,4.446013e+08,4.970592e+08,5.007221e+08,...,5.620670e+09,5.244721e+09,5.499371e+09,5.168998e+09,5.263165e+09,5.191509e+09,4.202063e+09,4.314102e+09,4.484653e+09,4.959692e+09
3,Burkina Faso,BFA,Country,Military expenditure (current USD),1.268378e+06,1.643154e+06,4.901761e+06,5.281288e+06,5.358593e+06,3.509330e+06,...,1.273333e+08,1.237005e+08,1.388509e+08,1.477297e+08,1.661363e+08,1.771670e+08,1.479347e+08,1.494674e+08,1.910658e+08,3.124676e+08
4,Brazil,BRA,Country,Military expenditure (current USD),3.827298e+08,3.423397e+08,3.874490e+08,4.419996e+08,3.542279e+08,6.328690e+08,...,2.564881e+10,3.400294e+10,3.693621e+10,3.398701e+10,3.287479e+10,3.265961e+10,2.461770e+10,2.422475e+10,2.928305e+10,2.776643e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Latin America & the Caribbean (IDA & IBRD coun...,TLA,Regions Clubbed Economically,Military expenditure (current USD),1.041102e+09,9.521159e+08,1.470078e+09,1.558299e+09,1.478294e+09,1.824899e+09,...,5.652421e+10,6.948861e+10,7.469882e+10,7.649189e+10,7.973844e+10,7.455081e+10,6.156557e+10,5.727127e+10,6.557719e+10,6.418384e+10
66,Tunisia,TUN,Country,Military expenditure (current USD),1.595238e+07,1.857143e+07,1.428571e+07,1.523810e+07,1.747899e+07,1.276190e+07,...,5.647759e+08,5.711890e+08,7.152396e+08,6.812260e+08,7.593589e+08,9.083573e+08,9.794940e+08,9.877347e+08,8.589496e+08,8.442274e+08
67,Turkey,TUR,Country,Military expenditure (current USD),4.688109e+08,3.013304e+08,3.303769e+08,3.500000e+08,3.808978e+08,4.226770e+08,...,1.635230e+10,1.793937e+10,1.730488e+10,1.795824e+10,1.866257e+10,1.777217e+10,1.588093e+10,1.785398e+10,1.782401e+10,1.896711e+10
68,United States,USA,Country,Military expenditure (current USD),4.538000e+10,4.780800e+10,5.238100e+10,5.229500e+10,5.121300e+10,5.182700e+10,...,6.690000e+11,6.980000e+11,7.110000e+11,6.850000e+11,6.400000e+11,6.100000e+11,5.960000e+11,6.000000e+11,6.060000e+11,6.490000e+11
