# Importing Libraries

In [1]:
## importing libraries for loading and preprocessing data 
import numpy as np
import pandas as pd

# Loading the Dataset and Creating a Copy

In [2]:
# Load the raw Toyota CSV into a DataFrame (the path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="gdap_cc_project/Datasets/Toyota.csv")


In [3]:
## work on an independent (deep) copy so the loaded `data` frame stays untouched;
## DataFrame.copy() copies deeply by default
df1 = data.copy()

In [4]:
df1.head(n=5)  ## show the first five rows

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [5]:
df1.tail(n=5)  ## show the last five rows

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
1431,1431,7500,,20544,Petrol,86,1.0,0,1300,3,1025
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015
1433,1433,8500,,17016,Petrol,86,0.0,0,1300,3,1015
1434,1434,7250,70.0,??,,86,1.0,0,1300,3,1015
1435,1435,6950,76.0,1,Petrol,110,0.0,0,1600,5,1114


In [6]:
df1.columns ## displays the label (name) of each column in the dataset

Index(['Unnamed: 0', 'Price', 'Age', 'KM', 'FuelType', 'HP', 'MetColor',
       'Automatic', 'CC', 'Doors', 'Weight'],
      dtype='object')

In [7]:
## print every column label in upper case (display only; df1 itself is not modified)
for column_label in df1.columns:
    print(column_label.upper())

UNNAMED: 0
PRICE
AGE
KM
FUELTYPE
HP
METCOLOR
AUTOMATIC
CC
DOORS
WEIGHT


In [8]:
df1['Price'] ## display every record in the Price column together with its index label

0       13500
1       13750
2       13950
3       14950
4       13750
        ...  
1431     7500
1432    10845
1433     8500
1434     7250
1435     6950
Name: Price, Length: 1436, dtype: int64

In [9]:
## access a single value by its row label and column label
df1.at[2, 'Weight']

1165

In [10]:
df1.iat[2, 10] # same lookup by integer position: row 2, column 10 (the 'Weight' column)

1165

In [11]:
## access a group of rows and a column by label; the label slice 0:12 includes both endpoints

df1.loc[ 0: 12, 'Price']

0     13500
1     13750
2     13950
3     14950
4     13750
5     12950
6     16900
7     18600
8     21500
9     12950
10    20950
11    19950
12    19600
Name: Price, dtype: int64

In [12]:
df1.loc[50:70, 'Price'] # display the Price values for row labels 50 through 70 (inclusive)

50    17950
51    15750
52    20500
53    21950
54    15500
55    13250
56    15250
57    15250
58    18950
59    15999
60    14950
61    16500
62    18750
63    17950
64    17950
65    16950
66    18950
67    14950
68    22250
69    15950
70    15950
Name: Price, dtype: int64

In [13]:
## every column for row labels 10 through 15 (inclusive)
df1.loc[10:15, :]

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
10,10,20950,25.0,31461,Petrol,192,0.0,0,1800,3,1185
11,11,19950,22.0,43610,Petrol,192,0.0,0,1800,3,1185
12,12,19600,25.0,32189,Petrol,192,0.0,0,1800,3,1185
13,13,21500,31.0,23000,Petrol,192,1.0,0,1800,3,1185
14,14,22500,32.0,34131,Petrol,192,1.0,0,1800,3,1185
15,15,22000,28.0,18739,Petrol,????,0.0,0,1800,3,1185


In [14]:
## returns the total number of cells in the dataset (rows x columns)
df1.size

15796

In [15]:
## return the shape of the dataset as a (rows, columns) tuple
df1.shape

(1436, 11)

In [16]:
## report the row and column counts
print("Total number of rows: ", len(df1.index), "Total number of columns: ", len(df1.columns))

Total number of rows:  1436 Total number of columns:  11


In [17]:
# the same report, built with an f-string
print(f"Total number of rows:,{len(df1.index)} ,Total number of columns:{len(df1.columns)}")

Total number of rows:,1436 ,Total number of columns:11


In [18]:
df1.info() # summary of the frame: index range, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1436 non-null   int64  
 1   Price       1436 non-null   int64  
 2   Age         1336 non-null   float64
 3   KM          1436 non-null   object 
 4   FuelType    1336 non-null   object 
 5   HP          1436 non-null   object 
 6   MetColor    1286 non-null   float64
 7   Automatic   1436 non-null   int64  
 8   CC          1436 non-null   int64  
 9   Doors       1436 non-null   object 
 10  Weight      1436 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 123.5+ KB


In [19]:
# returns the data type of each column
df1.dtypes

Unnamed: 0      int64
Price           int64
Age           float64
KM             object
FuelType       object
HP             object
MetColor      float64
Automatic       int64
CC              int64
Doors          object
Weight          int64
dtype: object

In [20]:
## returns the number of distinct values in each column (missing values are not counted)
df1.nunique()

Unnamed: 0    1436
Price          236
Age             77
KM            1256
FuelType         3
HP              13
MetColor         2
Automatic        2
CC              12
Doors            7
Weight          59
dtype: int64

In [21]:
# distinct values of the Price column, in order of first appearance
df1['Price'].unique()

array([13500, 13750, 13950, 14950, 12950, 16900, 18600, 21500, 20950,
       19950, 19600, 22500, 22000, 22750, 17950, 16750, 16950, 15950,
       16250, 17495, 15750, 15500, 14750, 19000, 15800, 21950, 20500,
       13250, 15250, 18950, 15999, 16500, 18750, 22250, 12995, 18450,
       16895, 14900, 17250, 15450, 16650, 17450, 16450, 18900, 18990,
       18500, 19450, 18800, 32500, 31000, 31275, 24950, 22950, 24990,
       17900, 19250, 16350, 21750, 15850, 23000, 19900, 23950, 24500,
       17200, 19500, 16868, 19750, 20750, 17650, 17795, 18245, 23750,
       18700, 21125,  6950,  9500, 11950,  7750,  4350,  4750, 11750,
       11900,  9950, 11495, 11250, 10500, 10450, 11500, 12500, 10950,
       11450, 11790, 12450, 11690, 12750, 11925, 12900, 11650, 10850,
        9940, 13450, 12495, 12000, 11480, 14990, 12850, 11700, 11895,
       13875, 12295, 13995,  9900, 11990, 10750, 11695, 11000, 12400,
       12200, 12695, 14350, 10250,  6500,  6400,  7000,  8900,  8500,
        8950,  9250,

In [22]:
# distinct values of Doors
# NOTE(review): the column mixes digit strings ('3') with spelled-out words ('three'),
# so the same door count appears under two labels
df1['Doors'].unique()

array(['three', '3', '5', '4', 'four', 'five', '2'], dtype=object)

In [23]:
# distinct values of Age; nan marks the missing entries
df1['Age'].unique()

array([23., 24., 26., 30., 32., 27., 25., 22., 31., 28., 29., nan, 20.,
       19., 17., 11., 18., 13., 14.,  4.,  8.,  7., 16., 15., 10., 12.,
        9.,  6.,  2.,  1., 43., 38., 40., 44., 41., 37., 39., 42., 35.,
       34., 33., 36., 53., 51., 54., 45., 55., 48., 52., 49., 56., 50.,
       47., 46., 58., 68., 59., 62., 65., 67., 64., 60., 61., 66., 57.,
       63., 73., 79., 78., 74., 77., 69., 80., 76., 70., 71., 75., 72.])

In [24]:
# distinct values of FuelType; nan marks the missing entries
df1['FuelType'].unique()

array(['Diesel', nan, 'Petrol', 'CNG'], dtype=object)

In [25]:
# distinct values of HP
# NOTE: the values are strings and include the placeholder '????', consistent with the object dtype
df1['HP'].unique()

array(['90', '????', '192', '110', '97', '71', '116', '98', '69', '86',
       '72', '107', '73'], dtype=object)

In [26]:
# distinct values of KM; the displayed array is truncated ('...'), so placeholder
# strings such as '??' (visible in df1.tail()) do not show up in this output
df1['KM'].unique()

array(['46986', '72937', '41711', ..., '30964', '20544', '17016'],
      dtype=object)

In [27]:
# distinct values of MetColor; nan marks the missing entries
df1['MetColor'].unique()

array([ 1., nan,  0.])

In [28]:
# distinct values of Automatic
df1['Automatic'].unique()

array([0, 1], dtype=int64)

In [29]:
# distinct values of CC
df1['CC'].unique()

array([2000, 1800, 1900, 1600, 1400, 1598, 1995, 1398, 1300, 1587, 1975,
       1332], dtype=int64)

In [30]:
# distinct values of Weight
df1['Weight'].unique()

array([1165, 1170, 1245, 1185, 1105, 1065, 1120, 1100, 1255, 1270, 1110,
       1195, 1180, 1075, 1130, 1275, 1060, 1115, 1265, 1260, 1125, 1155,
       1045, 1480, 1320, 1280, 1135, 1090, 1150, 1085, 1160, 1205, 1084,
       1140, 1095, 1025, 1119, 1080, 1121, 1615, 1067, 1040, 1030, 1055,
       1050, 1103, 1070, 1035, 1015, 1000, 1078, 1079, 1109, 1020, 1010,
       1114, 1172, 1094, 1083], dtype=int64)

In [31]:
# returns the number of missing (NaN) values in each column
# NOTE: string placeholders such as '??' (KM) and '????' (HP) are ordinary strings
# and are therefore not counted as missing here
df1.isnull().sum()

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              0
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [32]:
# preview only: replace() returns a new Series and the result is not assigned here,
# so df1['HP'] is unchanged by this cell
df1['HP'].replace('????', np.nan)

0        90
1        90
2        90
3        90
4        90
       ... 
1431     86
1432     86
1433     86
1434     86
1435    110
Name: HP, Length: 1436, dtype: object

In [33]:
# store the cleaned column: the '????' placeholder becomes a real missing value
df1['HP'] = df1['HP'].replace(to_replace='????', value=np.nan)

In [34]:
# confirm that the '????' placeholder has been replaced by nan
df1['HP'].unique()

array(['90', nan, '192', '110', '97', '71', '116', '98', '69', '86', '72',
       '107', '73'], dtype=object)

In [41]:
# cast HP from strings to floating point (nan entries stay nan)
df1['HP'] = df1['HP'].astype(float)

In [42]:
# confirm that HP is now float64
df1.dtypes

Unnamed: 0      int64
Price           int64
Age           float64
KM             object
FuelType       object
HP            float64
MetColor      float64
Automatic       int64
CC              int64
Doors          object
Weight          int64
dtype: object

In [44]:
# re-count missing values: HP now reports its former '????' entries as missing
df1.isnull().sum()

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              6
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [45]:
# first five rows again; HP is now displayed as float
df1.head()

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90.0,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90.0,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90.0,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90.0,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90.0,0.0,0,2000,3,1170


In [47]:
## compute the value used to impute the missing HP entries: the mean of the non-missing values
mean_HP = df1['HP'].mean(skipna=True)

In [48]:
# show the computed HP mean
print(mean_HP)

101.47832167832168


In [49]:
# Impute the missing HP values with the column mean.
# fillna() returns a new Series, so the result has to be assigned back;
# without the assignment df1['HP'] would keep its missing values.
df1['HP'] = df1['HP'].fillna(mean_HP)
# display the filled column as the cell output
df1['HP']

0        90.0
1        90.0
2        90.0
3        90.0
4        90.0
        ...  
1431     86.0
1432     86.0
1433     86.0
1434     86.0
1435    110.0
Name: HP, Length: 1436, dtype: float64

In [51]:
# KM marks unknown readings with the literal string '??' (visible in df1.tail()).
# The '...' shown by unique() is only NumPy's display truncation, not a value in
# the column, so replacing '...' would leave the placeholders in place.
df1['KM'] = df1['KM'].replace('??', np.nan)

In [53]:
# re-inspect the distinct KM values after the replacement
# (long arrays are displayed truncated, with '...' standing in for the omitted values)
df1['KM'].unique()

array(['46986', '72937', '41711', ..., '30964', '20544', '17016'],
      dtype=object)