# Exercise 02. Preprocessing

#### - Turn-in directory: `ex02/`.
#### - Files to turn in: `preprocessing.ipynb`.
#### - Allowed functions: `import pandas as pd`.

In [1]:
import pandas as pd

####
## 1. [Download](https://drive.google.com/open?id=1kFMUiXtrlw5B6WTjJ-mUm4-STV28Zsvn) and read the CSV file, making `ID` the index column:

In [2]:
# Location of the raw dataset, relative to the notebook's working directory.
file_path = '../data/auto.csv'

# Load the CSV and use its `ID` column as the row index.
df = pd.read_csv(
    file_path,
    index_col='ID',
)

In [3]:
# Show the frame as loaded; the notebook renders a truncated preview.
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


####
## 2. Count the number of observations using the method `count()`:

In [4]:
# count() reports the number of non-NA cells per column, so columns with
# missing values show fewer entries than the total number of rows.
input_count = df.count()
input_count

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

####
## 3. Drop the duplicates, taking into account only the following columns: `CarNumber`, `Make_n_model`, and `Fines`:
   - Between two equal observations, choose the last one.
   - Check the number of observations again.

In [5]:
# Two rows count as duplicates when they agree on all three key columns;
# of each duplicate group only the last occurrence is kept.
dedup_columns = ['CarNumber', 'Make_n_model', 'Fines']
df = df.drop_duplicates(subset=dedup_columns, keep='last')

In [6]:
# Show the frame after duplicate rows were removed.
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
5,92918M178RUS,Ford Focus,1.0,5700.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


In [7]:
# Non-NA counts per column after de-duplication.
df.count()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

####
## 4. Work with missing values:
   - Check how many values are missing from each column.
   - Drop all columns with over 500 missing values using the argument `thresh`. Check how many missing values are in each column.
   - Replace all the missing values in the `Refund` column with the previous value in that column for that cell. Use the argument `method` (deprecated since pandas 2.1 in favour of `ffill()`) and check how many values are missing from each column.
   - Replace all the missing values in the `Fines` column with the mean value of this column (excluding NA/NULL values when computing the mean). Check how many values are missing from each column.

In [8]:
# Missing values per column before any cleaning.
df.isna().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [9]:
# `thresh` is the minimum number of NON-missing values a column needs in order
# to be kept. "Drop columns with over 500 missing values" therefore translates
# to thresh = number_of_rows - 500, not thresh = 500.
max_missing = 500
# `.copy()` makes the result an independent frame, so the column assignments
# in later cells do not operate on a possible view of the previous frame.
df = df.dropna(axis=1, thresh=len(df) - max_missing).copy()

In [10]:
# Show the frame after columns were dropped by dropna(axis=1, ...).
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0
1,E432XX77RUS,Toyota Camry,1.0,6500.0
2,7184TT36RUS,Ford Focus,1.0,2100.0
3,X582HE161RUS,Ford Focus,2.0,2000.0
5,92918M178RUS,Ford Focus,1.0,5700.0
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0
927,M0309X197RUS,Ford Focus,1.0,22300.0
928,O673E8197RUS,Ford Focus,2.0,600.0
929,8610T8154RUS,Ford Focus,1.0,2000.0


In [11]:
# Missing values per column after dropping the sparse columns.
df.isna().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [12]:
# Forward-fill: every missing Refund takes the value of the nearest preceding row.
# `ffill()` is the supported spelling; `fillna(method='ffill')` has been
# deprecated since pandas 2.1 and emits a FutureWarning.
df['Refund'] = df['Refund'].ffill()

In [13]:
# Missing values per column after forward-filling `Refund`.
df.isna().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [14]:
# skipna=True (the pandas default) leaves missing fines out of the average.
fines_mean = df['Fines'].mean(skipna=True)
fines_mean

8594.586466165414

In [15]:
# Impute every missing fine with the column mean computed above.
df['Fines'] = df['Fines'].fillna(fines_mean)

In [16]:
# Show the frame after the missing `Fines` values were filled.
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.000000
1,E432XX77RUS,Toyota Camry,1.0,6500.000000
2,7184TT36RUS,Ford Focus,1.0,2100.000000
3,X582HE161RUS,Ford Focus,2.0,2000.000000
5,92918M178RUS,Ford Focus,1.0,5700.000000
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.000000
927,M0309X197RUS,Ford Focus,1.0,22300.000000
928,O673E8197RUS,Ford Focus,2.0,600.000000
929,8610T8154RUS,Ford Focus,1.0,2000.000000


In [17]:
# Missing values per column after mean-imputing `Fines`.
df.isna().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

####
## 5. Split and parse the make and model:
   - Use the apply method for both splitting and extracting values to the new columns, `Make` and `Model`.
   - Drop the column `Make_n_model`.
   - Save the dataframe in the `auto.json` JSON file in the format below:
    ```
    [{"CarNumber":"Y163O8161RUS","Refund":2.0,"Fines":3200.0,"Make":"Ford",
    "Model":"Focus"},
    {"CarNumber":"E432XX77RUS","Refund":1.0,"Fines":6500.0,"Make":"Toyota",
    "Model":"Camry"}]
    ```

In [18]:
# df[['Make', 'Model']] = df['Make_n_model'].apply(lambda x: pd.Series(str(x).split(' ', 1)))
def split_col_data(text):
    """Split a combined "make model" string at the first space.

    Parameters
    ----------
    text : str
        Value such as ``"Ford Focus"``. A missing value (NaN / None / pd.NA)
        is also accepted.

    Returns
    -------
    pd.Series
        Two positional elements ``[make, model]``. ``model`` is ``pd.NA``
        when ``text`` contains no space; both elements are ``pd.NA`` when
        ``text`` itself is missing.
    """
    # A missing cell has no `.split`/`.partition`; propagate it as two missing
    # parts instead of raising AttributeError in the middle of `apply`.
    if not isinstance(text, str) and pd.isna(text):
        return pd.Series([pd.NA, pd.NA])

    # Only the first space separates make from model, so multi-word models
    # (e.g. "Rover Discovery") stay intact.
    make, separator, model = text.partition(' ')
    if not separator:
        return pd.Series([make, pd.NA])

    return pd.Series([make, model])
    
# apply() yields one two-element Series per row, i.e. a two-column frame,
# which is then assigned to the new `Make` and `Model` columns.
make_model_parts = df['Make_n_model'].apply(split_col_data)
df[['Make', 'Model']] = make_model_parts
# df['Model'].values

In [19]:
# Remove the combined column and display the resulting frame.
df = df.drop('Make_n_model', axis=1)
df

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


In [20]:
# Destination of the cleaned dataset, relative to the notebook's working directory.
save_file_path = '../data/auto.json'

# orient='records' writes a JSON array with one object per row; the index is
# not part of the output. Pass indent=4 as well for a human-readable file.
df.to_json(path_or_buf=save_file_path, orient='records')

In [21]:
# #testing if can open file and printing it:
# test_save_df = pd.read_json(save_file_path)
# print(test_save_df.to_json(orient='records', indent=4))

In [22]:
# test_save_df['Model'].values

In [23]:
# import pandas as pd

# def preprocess():
#     try:
#         file_path = '../data/auto.csv'
#         df = pd.read_csv(file_path, index_col='ID')
#         return df
#     except Exception as e:
#         print(e)

# if __name__ == '__main__':
#     df = preprocess()