In [62]:
import pandas as pd

In [63]:
car_sales = pd.read_csv(r"C:\Users\nazli\Desktop\car-sales.csv")

# Manipulating Data

In [64]:
car_sales["Make"].str.lower()

0    toyota
1     honda
2    toyota
3       bmw
4    nissan
5    toyota
6     honda
7     honda
8    toyota
9    nissan
Name: Make, dtype: object

 converts all text in the Make column of the car_sales DataFrame to lowercase

In [65]:
car_sales["Make"] = car_sales["Make"].str.lower()


In [66]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,toyota,White,150043,4,"$4,000.00"
1,honda,Red,87899,4,"$5,000.00"
2,toyota,Blue,32549,3,"$7,000.00"
3,bmw,Black,11179,5,"$22,000.00"
4,nissan,White,213095,4,"$3,500.00"
5,toyota,Green,99213,4,"$4,500.00"
6,honda,Blue,45698,4,"$7,500.00"
7,honda,Blue,54738,4,"$7,000.00"
8,toyota,White,60000,4,"$6,250.00"
9,nissan,White,31600,4,"$9,700.00"


### Working with missing data

In [67]:
car_sales_missing = pd.read_csv(r"C:\Users\nazli\Desktop\car-sales-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"
5,Toyota,Green,,4.0,"$4,500"
6,Honda,,,4.0,"$7,500"
7,Honda,Blue,,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,"$9,700"


In [68]:
car_sales_missing["Odometer"].mean()

92302.66666666667

 calculates the average of the numeric values '.mean()'

In [69]:
car_sales_missing["Odometer"] = car_sales_missing["Odometer"].fillna(car_sales_missing["Odometer"].mean())

The `fillna()` method is used to fill missing (`NaN`) values in a pandas DataFrame or Series with a specified value. This helps manage missing data during data analysis and cleaning.

 it replaces missing mileage values with the average mileage value from the column.

In [70]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,92302.666667,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"
5,Toyota,Green,92302.666667,4.0,"$4,500"
6,Honda,,92302.666667,4.0,"$7,500"
7,Honda,Blue,92302.666667,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,"$9,700"


In [71]:
car_sales_missing.dropna()

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,92302.666667,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"
5,Toyota,Green,92302.666667,4.0,"$4,500"


Removes rows or columns containing missing (`NaN`) values.

If you only write this, it will apply temporarily. To make it permanent:

In [72]:
car_sales_missing.dropna(inplace=True)

### Add new column

In [73]:
#column from series
seats_column = pd.Series([5,5,5,5,5])

#new column called seats
car_sales["Seats"] = seats_column
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats
0,toyota,White,150043,4,"$4,000.00",5.0
1,honda,Red,87899,4,"$5,000.00",5.0
2,toyota,Blue,32549,3,"$7,000.00",5.0
3,bmw,Black,11179,5,"$22,000.00",5.0
4,nissan,White,213095,4,"$3,500.00",5.0
5,toyota,Green,99213,4,"$4,500.00",
6,honda,Blue,45698,4,"$7,500.00",
7,honda,Blue,54738,4,"$7,000.00",
8,toyota,White,60000,4,"$6,250.00",
9,nissan,White,31600,4,"$9,700.00",


In [74]:
car_sales["Seats"].fillna(5, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_sales["Seats"].fillna(5, inplace=True)


In [75]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats
0,toyota,White,150043,4,"$4,000.00",5.0
1,honda,Red,87899,4,"$5,000.00",5.0
2,toyota,Blue,32549,3,"$7,000.00",5.0
3,bmw,Black,11179,5,"$22,000.00",5.0
4,nissan,White,213095,4,"$3,500.00",5.0
5,toyota,Green,99213,4,"$4,500.00",5.0
6,honda,Blue,45698,4,"$7,500.00",5.0
7,honda,Blue,54738,4,"$7,000.00",5.0
8,toyota,White,60000,4,"$6,250.00",5.0
9,nissan,White,31600,4,"$9,700.00",5.0


### Column from python list

In [76]:
fuel_economy = [7.5, 9.2, 5.0, 9.6, 8.7, 4.7, 4.7, 1.7, 3.9, 8.8]
car_sales["Fuel per 100KM"] = fuel_economy
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats,Fuel per 100KM
0,toyota,White,150043,4,"$4,000.00",5.0,7.5
1,honda,Red,87899,4,"$5,000.00",5.0,9.2
2,toyota,Blue,32549,3,"$7,000.00",5.0,5.0
3,bmw,Black,11179,5,"$22,000.00",5.0,9.6
4,nissan,White,213095,4,"$3,500.00",5.0,8.7
5,toyota,Green,99213,4,"$4,500.00",5.0,4.7
6,honda,Blue,45698,4,"$7,500.00",5.0,4.7
7,honda,Blue,54738,4,"$7,000.00",5.0,1.7
8,toyota,White,60000,4,"$6,250.00",5.0,3.9
9,nissan,White,31600,4,"$9,700.00",5.0,8.8


In [77]:
car_sales["Total fuel used"] = car_sales["Odometer (KM)"]/100 * car_sales["Fuel per 100KM"]

In [78]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats,Fuel per 100KM,Total fuel used
0,toyota,White,150043,4,"$4,000.00",5.0,7.5,11253.225
1,honda,Red,87899,4,"$5,000.00",5.0,9.2,8086.708
2,toyota,Blue,32549,3,"$7,000.00",5.0,5.0,1627.45
3,bmw,Black,11179,5,"$22,000.00",5.0,9.6,1073.184
4,nissan,White,213095,4,"$3,500.00",5.0,8.7,18539.265
5,toyota,Green,99213,4,"$4,500.00",5.0,4.7,4663.011
6,honda,Blue,45698,4,"$7,500.00",5.0,4.7,2147.806
7,honda,Blue,54738,4,"$7,000.00",5.0,1.7,930.546
8,toyota,White,60000,4,"$6,250.00",5.0,3.9,2340.0
9,nissan,White,31600,4,"$9,700.00",5.0,8.8,2780.8


### Create a column from a single value

In [79]:
car_sales["Number of wheels"] = 4
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats,Fuel per 100KM,Total fuel used,Number of wheels
0,toyota,White,150043,4,"$4,000.00",5.0,7.5,11253.225,4
1,honda,Red,87899,4,"$5,000.00",5.0,9.2,8086.708,4
2,toyota,Blue,32549,3,"$7,000.00",5.0,5.0,1627.45,4
3,bmw,Black,11179,5,"$22,000.00",5.0,9.6,1073.184,4
4,nissan,White,213095,4,"$3,500.00",5.0,8.7,18539.265,4
5,toyota,Green,99213,4,"$4,500.00",5.0,4.7,4663.011,4
6,honda,Blue,45698,4,"$7,500.00",5.0,4.7,2147.806,4
7,honda,Blue,54738,4,"$7,000.00",5.0,1.7,930.546,4
8,toyota,White,60000,4,"$6,250.00",5.0,3.9,2340.0,4
9,nissan,White,31600,4,"$9,700.00",5.0,8.8,2780.8,4


In [80]:
car_sales =car_sales.drop("Number of wheels" , axis=1)
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats,Fuel per 100KM,Total fuel used
0,toyota,White,150043,4,"$4,000.00",5.0,7.5,11253.225
1,honda,Red,87899,4,"$5,000.00",5.0,9.2,8086.708
2,toyota,Blue,32549,3,"$7,000.00",5.0,5.0,1627.45
3,bmw,Black,11179,5,"$22,000.00",5.0,9.6,1073.184
4,nissan,White,213095,4,"$3,500.00",5.0,8.7,18539.265
5,toyota,Green,99213,4,"$4,500.00",5.0,4.7,4663.011
6,honda,Blue,45698,4,"$7,500.00",5.0,4.7,2147.806
7,honda,Blue,54738,4,"$7,000.00",5.0,1.7,930.546
8,toyota,White,60000,4,"$6,250.00",5.0,3.9,2340.0
9,nissan,White,31600,4,"$9,700.00",5.0,8.8,2780.8


In [81]:
car_sales.sample(frac=1)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats,Fuel per 100KM,Total fuel used
0,toyota,White,150043,4,"$4,000.00",5.0,7.5,11253.225
8,toyota,White,60000,4,"$6,250.00",5.0,3.9,2340.0
5,toyota,Green,99213,4,"$4,500.00",5.0,4.7,4663.011
3,bmw,Black,11179,5,"$22,000.00",5.0,9.6,1073.184
6,honda,Blue,45698,4,"$7,500.00",5.0,4.7,2147.806
9,nissan,White,31600,4,"$9,700.00",5.0,8.8,2780.8
1,honda,Red,87899,4,"$5,000.00",5.0,9.2,8086.708
2,toyota,Blue,32549,3,"$7,000.00",5.0,5.0,1627.45
4,nissan,White,213095,4,"$3,500.00",5.0,8.7,18539.265
7,honda,Blue,54738,4,"$7,000.00",5.0,1.7,930.546


sample(): This method is used to randomly sample rows from a DataFrame.
frac: This parameter specifies the fraction of rows to return. In this case, frac=1 means that 100% of the rows should be included, but their order will be randomized.

### *Only select 20% of data*

In [82]:
car_sales.sample(frac=0.2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats,Fuel per 100KM,Total fuel used
9,nissan,White,31600,4,"$9,700.00",5.0,8.8,2780.8
3,bmw,Black,11179,5,"$22,000.00",5.0,9.6,1073.184


In [83]:
car_sales["Odometer (KM)"] = car_sales["Odometer (KM)"].apply(lambda x: x / 1.6)
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Seats,Fuel per 100KM,Total fuel used
0,toyota,White,93776.875,4,"$4,000.00",5.0,7.5,11253.225
1,honda,Red,54936.875,4,"$5,000.00",5.0,9.2,8086.708
2,toyota,Blue,20343.125,3,"$7,000.00",5.0,5.0,1627.45
3,bmw,Black,6986.875,5,"$22,000.00",5.0,9.6,1073.184
4,nissan,White,133184.375,4,"$3,500.00",5.0,8.7,18539.265
5,toyota,Green,62008.125,4,"$4,500.00",5.0,4.7,4663.011
6,honda,Blue,28561.25,4,"$7,500.00",5.0,4.7,2147.806
7,honda,Blue,34211.25,4,"$7,000.00",5.0,1.7,930.546
8,toyota,White,37500.0,4,"$6,250.00",5.0,3.9,2340.0
9,nissan,White,19750.0,4,"$9,700.00",5.0,8.8,2780.8


This applies a lambda function to each value x in the "Odometer (KM)" column. The function divides each value by 1.6, effectively converting kilometers to miles.