In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory, then print
# each full path on its own line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
# Read car_prices.csv from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/vehicle-sales-data/car_prices.csv")

In [None]:
data.head()

# Size Of Data

In [None]:

data.shape

In [None]:
data.info()

In [None]:
data.describe()

# Overview Of Missing Values

In [None]:
 sns.heatmap(data.isnull(), yticklabels = False, cbar = False, cmap = 'viridis')

# Rows with at least one missing value

In [None]:
# Flag every row in which at least one column is NaN, then keep only those rows.
has_missing_value = data.isna().any(axis='columns')
missing_data = data.loc[has_missing_value]
# Displayed as (number of incomplete rows, number of columns).
missing_data.shape

In [None]:
# Share of rows that contain at least one missing value. The stored value stays
# a 0-1 fraction (unchanged for any later use of `percentage_missing`).
percentage_missing = len(missing_data) / len(data)
# Print it as a labelled percentage: the bare two-decimal fraction printed
# before was easy to misread given the variable's name.
print(f'Rows with at least one missing value: {percentage_missing:.2%}')

# Count of missing values in each column

In [None]:
 missing_values = data[['make', 'model', 'trim', 'body','transmission','state','color','interior','condition']].isna().sum()
print(missing_values)

# Dropping rows with null values in every column except transmission

In [None]:
 
# A row is kept only if all of these columns are populated; 'transmission' is
# deliberately absent from the list, so rows missing only that value survive.
required_columns = ['make', 'model', 'trim', 'body', 'state', 'color', 'interior', 'condition']
data.dropna(subset=required_columns, inplace=True)
# Redraw the null map for the frame as it stands after the drop.
remaining_nulls = data.isnull()
sns.heatmap(remaining_nulls, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Frequency of each transmission value (value_counts skips NaN by default).
transmission_count = data['transmission'].value_counts()
# Tabular form with explicitly named columns: 'transmission' and 'count'.
transmission_count_df = transmission_count.rename_axis('transmission').reset_index(name='count')
total_count = transmission_count_df['count'].sum()
# .get() falls back to 0 when a category is absent; the previous
# `.loc[...].values[0]` lookup raised IndexError on an empty selection.
automatic_count = transmission_count.get('automatic', 0)
manual_count = transmission_count.get('manual', 0)
# Shares are relative to all non-missing transmission values.
automatic_percentage = automatic_count / total_count * 100
manual_percentage = manual_count / total_count * 100
print(total_count)
print(f'The percentage of automatic vehicles is:  {automatic_percentage: .2f}')
print(f'The percentage of manual vehicles is:  {manual_percentage: .2f}')



In [None]:
# Number of rows whose transmission value is missing at this point.
total_missing = data.transmission.isna().sum()

# Filling missing values with same proportion. 

In [None]:
# Mask of rows whose transmission value is missing.
missing_transmission = data['transmission'].isnull()
# Size the fill from the mask itself so the assignment below always matches
# the number of selected rows. Relying on a count computed in another cell
# made this cell fail with a length mismatch when re-run after the fill.
total_missing = int(missing_transmission.sum())

# Automatic/manual split observed among the known values. This replaces the
# hand-typed 0.96 / 0.04 so the fill follows the data actually loaded.
known_counts = (
    data.loc[~missing_transmission, 'transmission']
    .value_counts()
    .reindex(['automatic', 'manual'], fill_value=0)
)
if known_counts.sum() > 0:
    fill_probabilities = (known_counts / known_counts.sum()).to_numpy()
else:
    # Nothing to learn the split from: fall back to the original constants.
    fill_probabilities = np.array([0.96, 0.04])

# Seeded generator so "Restart & Run All" reproduces the same imputed labels.
rng = np.random.default_rng(42)
choices = rng.choice(['automatic', 'manual'], size=total_missing, p=fill_probabilities)
# Replace the missing entries with the sampled labels.
data.loc[missing_transmission, 'transmission'] = choices
# Sanity check: number of transmission values still missing (expected 0).
print(data['transmission'].isnull().sum())

# Making sure there is no missing data left in any column

In [None]:

# Draw the null map of the current frame on an explicitly sized 12x8 figure.
null_mask = data.isnull()
plt.figure(figsize=(12, 8))
sns.heatmap(null_mask, cmap="viridis", cbar=False, yticklabels=False)

In [None]:
data.info()

In [None]:
import warnings
# Silence FutureWarning messages. The filter is process-wide, so it also
# applies to every cell executed after this one.
warnings.filterwarnings('ignore', category=FutureWarning)

# Explore the vehicles by the year they were manufactured.
min_year = data['year'].min()
max_year = data['year'].max()
print(min_year)
print(max_year)
sns.set_style('darkgrid')

# Bin edges: fixed cut points every five years, bracketed by the observed
# extremes. The set/sorted pass keeps the edges unique and increasing, which
# pd.cut requires, even if an extreme coincides with or crosses a cut point.
bins = sorted({min_year, 1995, 2000, 2005, 2010, max_year})
# pd.cut builds right-closed intervals, so without include_lowest=True every
# vehicle from min_year fell outside the first bin and was left out of the count.
year_bins = pd.cut(data['year'], bins=bins, include_lowest=True)
# observed=False is spelled out so empty bins still show up as zero-height bars
# regardless of the pandas default.
car_years = data.groupby(year_bins, observed=False).size()
car_years.plot(kind='bar')
plt.xlabel('Manufactured Year')
plt.ylabel('Count of Cars')
plt.title('Count of vehicle based on years manufacured')
plt.show()


# distribution based on transmission type 


In [None]:
# Pie chart of how often each transmission value occurs.
sns.set_style('darkgrid')
car_transmission = data['transmission'].value_counts()
slice_labels = car_transmission.index
plt.pie(
    car_transmission,
    labels=slice_labels,
    autopct='%1.1f%%',  # annotate each slice with its share, one decimal place
)
plt.title('Distribution of Transmission Types')
plt.axis('equal')  # equal axis scaling
plt.show()

# counting number of vehicles based on state. 

In [None]:

# Scatter of listing counts per state; tick labels are hidden and each point is
# annotated with its state value instead.
plt.figure(figsize=(12, 9))
car_state = data['state'].value_counts()
plt.xlabel("State")
plt.xticks([])
plt.scatter(car_state.index, car_state.values, color='blue')
# Write the first two characters of the state value 500 units above each point.
for state, count in car_state.items():
    plt.text(state, count + 500, state[:2], ha='center', va='bottom', fontsize=10)
plt.show()

# vehicles count based on condition 

In [None]:

# Scatter of how many vehicles carry each condition value.
vehicle_counts = data['condition'].value_counts()
plt.figure(figsize=(12, 9))
plt.scatter(vehicle_counts.index, vehicle_counts.values, color='green')
# Label each point with its condition value (no decimals), 500 units above it.
for condition, count in vehicle_counts.items():
    plt.text(condition, count + 500, f"{condition:.0f}", ha='center', va='bottom', fontsize=10)
plt.show()

# vehicle counts based on odometer reading

In [None]:

# Histogram of the raw odometer readings on a 12x9 figure.
# The bare `data.odometer.max()` that used to sit here was removed: a
# mid-cell expression is neither displayed nor stored, so it did nothing.
plt.figure(figsize=(12, 9))
# Column-name form of histplot, instead of passing the frame together with a
# separate Series for x.
sns.histplot(data=data, x='odometer')

# We have a big outlier in the odometer readings. Let's eliminate it.

In [None]:
data.odometer.max()

# Eliminating odometer readings greater than 500000.

In [None]:
# Collect the index labels of rows whose odometer reading exceeds 500000 and
# drop them from the frame in place.
above_limit = data.odometer > 500000
outlier_labels = data[above_limit].index
data.drop(index=outlier_labels, inplace=True)
# Displayed to confirm the new maximum after the drop.
data.odometer.max()

# vehicle counts based on odometer reading

In [None]:

# Histogram of odometer readings with the mean and median marked.
# The bare `data.odometer.max()` that used to follow the figure call was
# removed: a mid-cell expression is neither displayed nor stored.
plt.figure(figsize=(12, 9))
# Column-name form of histplot, instead of passing the frame together with a
# separate Series for x.
sns.histplot(data=data, x='odometer')
mean = data.odometer.mean()
median = data.odometer.median()
# Vertical reference lines; their labels feed the legend below.
plt.axvline(x=mean, color='green', linestyle='--', label=f'Mean: {mean:.2f}')
plt.axvline(x=median, color='purple', linestyle="-", label=f'Median:{median: .2f}')
plt.legend()
plt.title('Histogram of Odometer Data')
plt.xlabel("Odometer")
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of vehicles per colour, most frequent first (value_counts order).
vehicle_color = data['color'].value_counts()
# head must be *called*: printing `vehicle_color.head` without parentheses
# prints the bound-method repr instead of the first five rows.
print(vehicle_color.head())

