# Step 1 - Data Understanding and Cleaning

This step helps us understand the structure of the data.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

**Load and visualize the datasets:**

In [2]:
# Read the previous-years air system records from CSV into a DataFrame
previous_years_path = '../data/air_system_previous_years.csv'
previous_years_df = pd.read_csv(previous_years_path)
# Bare last expression so the notebook renders the frame as the cell output
previous_years_df

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,neg,153002,na,664,186,0,0,0,0,0,...,998500,566884,1290398,1218244,1019768,717762,898642,28588,0,0
59996,neg,2286,na,2130706538,224,0,0,0,0,0,...,10578,6760,21126,68424,136,0,0,0,0,0
59997,neg,112,0,2130706432,18,0,0,0,0,0,...,792,386,452,144,146,2622,0,0,0,0
59998,neg,80292,na,2130706432,494,0,0,0,0,0,...,699352,222654,347378,225724,194440,165070,802280,388422,0,0


In [3]:
# Read the present-year air system records from CSV into a DataFrame
present_year_path = '../data/air_system_present_year.csv'
present_year_df = pd.read_csv(present_year_path)
# Bare last expression so the notebook renders the frame as the cell output
present_year_df

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60,0,20,12,0,0,0,0,0,...,1098,138,412,654,78,88,0,0,0,0
1,neg,82,0,68,40,0,0,0,0,0,...,1068,276,1620,116,86,462,0,0,0,0
2,neg,66002,2,212,112,0,0,0,0,0,...,495076,380368,440134,269556,1315022,153680,516,0,0,0
3,neg,59816,na,1010,936,0,0,0,0,0,...,540820,243270,483302,485332,431376,210074,281662,3232,0,0
4,neg,1814,na,156,140,0,0,0,0,0,...,7646,4144,18466,49782,3176,482,76,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,neg,81852,na,2130706432,892,0,0,0,0,0,...,632658,273242,510354,373918,349840,317840,960024,25566,0,0
15996,neg,18,0,52,46,8,26,0,0,0,...,266,44,46,14,2,0,0,0,0,0
15997,neg,79636,na,1670,1518,0,0,0,0,0,...,806832,449962,778826,581558,375498,222866,358934,19548,0,0
15998,neg,110,na,36,32,0,0,0,0,0,...,588,210,180,544,1004,1338,74,0,0,0


**Handling Missing Values:**

Replace the string value "na" with the pandas missing-value marker (`pd.NA`), and then combine the two datasets for further processing.

In [4]:
# Swap the literal string 'na' for the pandas missing-value marker (pd.NA).
# No rows are dropped here: missing entries are only re-labelled.
# The names are rebound to the result of replace() instead of using
# inplace=True, so the frames are not mutated behind the earlier cells' backs.
previous_years_df = previous_years_df.replace('na', pd.NA)
present_year_df = present_year_df.replace('na', pd.NA)

# Stack both datasets row-wise; ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each input's own row labels.
combined_df = pd.concat([previous_years_df, present_year_df], ignore_index=True)
combined_df

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,,0,,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75995,neg,81852,,2130706432,892,0,0,0,0,0,...,632658,273242,510354,373918,349840,317840,960024,25566,0,0
75996,neg,18,0,52,46,8,26,0,0,0,...,266,44,46,14,2,0,0,0,0,0
75997,neg,79636,,1670,1518,0,0,0,0,0,...,806832,449962,778826,581558,375498,222866,358934,19548,0,0
75998,neg,110,,36,32,0,0,0,0,0,...,588,210,180,544,1004,1338,74,0,0,0


## Multiple choice questions about *'air_system_previous_years.csv'*

#### 2 - What is the percentage of trucks with defects?

In [11]:
# Tally how many rows carry each class label
previous_defects_count = previous_years_df['class'].value_counts()

# Share of rows labelled 'pos' among all counted rows, as a percentage
defective_trucks = previous_defects_count['pos']
total_trucks = previous_defects_count.sum()
previous_percentage_defects = (defective_trucks / total_trucks) * 100

print(f"Percentage of trucks with defects in the previous years: {previous_percentage_defects:.2f}%")

Percentage of trucks with defects in the previous years: 1.67%


#### 3 - What are the mean, median, and standard deviation of the column ‘ee_003’ considering null values? 

In [12]:
# Coerce ee_003 to a numeric dtype; entries that cannot be parsed become NaN
# (errors='coerce'). The converted column is written back onto the frame.
ee_003_numeric = pd.to_numeric(previous_years_df['ee_003'], errors='coerce')
previous_years_df['ee_003'] = ee_003_numeric

# These reductions use the pandas defaults, which skip NaN entries.
mean_ee_003 = ee_003_numeric.mean()
median_ee_003 = ee_003_numeric.median()
std_ee_003 = ee_003_numeric.std()

print(f"Mean of ee_003: {mean_ee_003}")
print(f"Median of ee_003: {median_ee_003}")
print(f"Standard Deviation of ee_003: {std_ee_003}")

Mean of ee_003: 211126.44730233107
Median of ee_003: 112086.0
Standard Deviation of ee_003: 543318.8167085947


In [10]:
# Select the rows of the combined data whose ee_003 has no numeric value after
# coercion. This includes entries that are already missing, not only text
# that fails to parse.
ee_003_as_number = pd.to_numeric(combined_df['ee_003'], errors='coerce')
non_numeric_values = combined_df[ee_003_as_number.isna()]
non_numeric_values

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
56,neg,24836,,,,,,,,,...,,,,,,,,,,
109,neg,10,,20,2,2,2,,,,...,,,,,,,,,0,0
115,pos,762958,,,,,,776,281128,2186308,...,,,,,,,,,,
164,neg,122082,,866,,0,0,,,,...,,,,,,,,,0,0
204,neg,52758,,220,,0,0,,,,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75631,neg,34,,8,,0,0,,,,...,,,,,,,,,0,0
75653,neg,562,,24,14,0,0,,,,...,,,,,,,,,0,0
75673,neg,952,0,38,20,0,0,,,,...,,,,,,,,,0,0
75735,neg,1500,,0,,0,0,,,,...,,,,,,,,,0,0
