# Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

from six.moves import urllib

# Suppress every warning category for the rest of the session so that
# library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# IPython magic: draw matplotlib figures inline, directly below the cell.
%matplotlib inline

# Read Data

In [2]:
# Load the car dataset into `Data`, the DataFrame every later cell works on.
from pathlib import Path

data_path = Path('car data.csv')
try:
    Data = pd.read_csv(data_path)
except FileNotFoundError as err:
    # A relative path is resolved against the current working directory,
    # so report the absolute location that was actually tried.
    raise FileNotFoundError(
        f"Dataset not found at '{data_path.resolve()}'. Place 'car data.csv' "
        "in the working directory or update data_path before re-running."
    ) from err

FileNotFoundError: [Errno 2] No such file or directory: 'car data.csv'

In [None]:
# Show the loaded DataFrame as the cell's output.
Data

# Top 5 Records

In [None]:
# Preview the first rows; head() returns 5 rows when called without arguments.
Data.head()

# Shape of the dataset

In [None]:
# (number of rows, number of columns) of the DataFrame.
Data.shape

The first value, 301, is the number of records (rows).
The second value, 10, is the number of features (columns).

# Summary of the dataset

In [None]:
# With default arguments describe() summarises the numeric columns only
# (count, mean, std, min, quartiles, max).
Data.describe() # Display the summary statistics of the dataset

# Check the datatype in the data

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
Data.info() # Check Null values and Data types

# Explore the dataset

In [None]:

# Split the column names by dtype: object-dtype ('O') columns are treated as
# categorical, every other dtype as numeric. Column order is preserved.
numeric_features = []
categorical_features = []
for column_name in Data.columns:
    if Data[column_name].dtype == 'O':
        categorical_features.append(column_name)
    else:
        numeric_features.append(column_name)


In [None]:
# Show the names of the columns whose dtype is not object.
numeric_features

In [None]:
# Show the names of the object-dtype columns.
categorical_features

# Features Information 

Car_Name: This column likely contains the names or identifiers of the cars.

company: This column appears to contain the company or manufacturer name of the car.

Year: Represents the manufacturing year of the car. It is of integer type (int64).

Selling_Price: Indicates the selling price of the car. It is of float type (float64).

Present_Price: Represents the present price or current market value of the car. It is also of float type (float64).

Kms_Driven: Denotes the total kilometers the car has been driven. It is of integer type (int64).

Fuel_Type: Specifies the type of fuel the car uses, which can be categorical (object) data. Common values might include 'Petrol', 'Diesel', or 'CNG'.

Seller_Type: Represents the type of seller, possibly indicating whether the seller is an individual or a dealer. It is likely a categorical feature (object).

Transmission: Describes the type of transmission the car has, such as 'Manual' or 'Automatic'. It is also likely a categorical feature (object).

Owner: Represents the number of previous owners of the car. It is of integer type (int64).

# Proportion of count data on categorical columns in Percentage

In [None]:
# For every categorical column, print the share of each distinct value as a
# percentage of all rows, followed by a divider line.
for col in categorical_features:
    percentage_by_value = Data[col].value_counts(normalize=True).mul(100)
    print(percentage_by_value)
    print('--------------------------------')

# Univariate Analysis

Univariate analysis examines a single variable at a time. The analyses below are applied to the features of this car dataset; which one is appropriate depends on whether the variable is numerical or categorical.
Numerical Variables:

    Year:
        Histogram: Visualize the distribution of car manufacturing years.
        Summary Statistics: Calculate mean, median, standard deviation, etc.

    Selling_Price and Present_Price:
        Histogram: Understand the distribution of selling prices and present prices.
        Box Plot: Detect outliers and understand the spread of prices.

    Kms_Driven:
        Histogram: Examine the distribution of kilometers driven.
        Summary Statistics: Understand the central tendency and spread.

    Owner:
        Count Plot: Visualize the distribution of the number of owners.
        Value Counts: Get the count of unique owner values.

Categorical Variables:

    Fuel_Type, Seller_Type, Transmission:
        Count Plot: Visualize the distribution of different fuel types, seller types, and transmission types.

    Car_Name and company:
        Value Counts: Count the occurrences of each car name and company.

Combined Analysis:

    Year vs. Selling_Price:
        Scatter Plot: Understand how selling prices vary with the manufacturing year.

    Fuel_Type vs. Present_Price:
        Box Plot: Visualize how the present prices vary for different fuel types.

    Transmission vs. Kms_Driven:
        Violin Plot: Explore the distribution of kilometers driven for different transmission types.

# Numerical Variables

In [None]:

# Histogram of manufacturing years (20 bins). update_layout() returns the
# figure itself, so the axis titles are applied in the same expression.
fig_year = px.histogram(
    data_frame=Data,
    x='Year',
    nbins=20,
    title='Distribution of Car Manufacturing Years',
    labels={'Year': 'Manufacturing Year', 'count': 'Frequency'},
    color_discrete_sequence=['#87CEFA'],
).update_layout(xaxis_title='Manufacturing Year', yaxis_title='Frequency')
fig_year.show()

# Central tendency and spread of the Year column.
year_series = Data['Year']
year_mean, year_median, year_std = (
    year_series.mean(),
    year_series.median(),
    year_series.std(),
)

print(f"Mean Year: {year_mean}")
print(f"Median Year: {year_median}")
print(f"Standard Deviation of Year: {year_std}")


In [None]:

# Histogram of the two price columns. Passing a list to `x` makes Plotly
# Express read the frame as wide-form and draw one coloured trace per column.
fig_prices = px.histogram(Data, x=['Selling_Price', 'Present_Price'], nbins=20,
                          title='Distribution of Selling Prices and Present Prices',
                          labels={'value': 'Price', 'variable': 'Variable'},
                          color_discrete_sequence=['#87CEFA', '#90EE90'])
fig_prices.update_layout(xaxis_title='Price', yaxis_title='Frequency')

# Box plot of the same two columns, reshaped to long form with melt()
# (column name in 'variable', price in 'value').
# color='variable' is what lets the second entry of color_discrete_sequence
# take effect; without a colour mapping both boxes use the first colour.
# boxmode='overlay' keeps each box centred on its tick, because x and color
# encode the same column and the default 'group' mode would offset them.
price_long = Data.melt(value_vars=['Selling_Price', 'Present_Price'])
fig_box_plot = px.box(price_long, x='variable', y='value', color='variable',
                      boxmode='overlay',
                      title='Box Plot of Selling Prices and Present Prices',
                      labels={'value': 'Price', 'variable': 'Variable'},
                      color_discrete_sequence=['#87CEFA', '#90EE90'])
fig_box_plot.update_layout(xaxis_title='Variable', yaxis_title='Price')

# Show the plots
fig_prices.show()
fig_box_plot.show()


In [None]:

# Histogram of kilometres driven (20 bins), with axis titles applied in the
# same expression through the figure returned by update_layout().
fig_kms_driven = px.histogram(
    data_frame=Data,
    x='Kms_Driven',
    nbins=20,
    title='Distribution of Kilometers Driven',
    labels={'Kms_Driven': 'Kilometers Driven', 'count': 'Frequency'},
    color_discrete_sequence=['#ff7f0e'],
).update_layout(xaxis_title='Kilometers Driven', yaxis_title='Frequency')
fig_kms_driven.show()

# Central tendency and spread of the Kms_Driven column.
kms_series = Data['Kms_Driven']
kms_mean, kms_median, kms_std = (
    kms_series.mean(),
    kms_series.median(),
    kms_series.std(),
)

print(f"Mean Kilometers Driven: {kms_mean}")
print(f"Median Kilometers Driven: {kms_median}")
print(f"Standard Deviation of Kilometers Driven: {kms_std}")


In [None]:
# Count plot of the Owner column, drawn as a histogram without explicit bins.
fig_owner = px.histogram(
    data_frame=Data,
    x='Owner',
    title='Distribution of Number of Owners',
    labels={'Owner': 'Number of Owners', 'count': 'Count'},
    color_discrete_sequence=['#8dd3c7'],
).update_layout(xaxis_title='Number of Owners', yaxis_title='Count')
fig_owner.show()

# Frequency table of the Owner values as a two-column DataFrame:
# the owner value in 'Number of Owners', its frequency in 'Count'.
owner_counts = (
    Data['Owner']
    .value_counts()
    .rename_axis('Number of Owners')
    .reset_index(name='Count')
)
print("Value Counts for Number of Owners:")
print(owner_counts)

# Categorical Variables

In [None]:

# One count plot per categorical column. Each figure receives its axis titles
# in the same expression, since update_layout() returns the figure.

# Fuel types, with the category order pinned to Petrol, Diesel, CNG.
fig_fuel_type = px.histogram(
    data_frame=Data,
    x='Fuel_Type',
    title='Distribution of Fuel Types',
    labels={'Fuel_Type': 'Fuel Type', 'count': 'Count'},
    category_orders={'Fuel_Type': ['Petrol', 'Diesel', 'CNG']},
    color_discrete_sequence=['#66c2a5'],
).update_layout(xaxis_title='Fuel Type', yaxis_title='Count')

# Seller types.
fig_seller_type = px.histogram(
    data_frame=Data,
    x='Seller_Type',
    title='Distribution of Seller Types',
    labels={'Seller_Type': 'Seller Type', 'count': 'Count'},
    color_discrete_sequence=['#fc8d62'],
).update_layout(xaxis_title='Seller Type', yaxis_title='Count')

# Transmission types.
fig_transmission = px.histogram(
    data_frame=Data,
    x='Transmission',
    title='Distribution of Transmission Types',
    labels={'Transmission': 'Transmission Type', 'count': 'Count'},
    color_discrete_sequence=['#8da0cb'],
).update_layout(xaxis_title='Transmission Type', yaxis_title='Count')

# Render the three figures in the same order as before.
fig_fuel_type.show()
fig_seller_type.show()
fig_transmission.show()


In [None]:

# Frequency table of car names: columns 'Car_Name' and 'Count'.
car_name_counts = (
    Data['Car_Name']
    .value_counts()
    .rename_axis('Car_Name')
    .reset_index(name='Count')
)

# Frequency table of the `company` column: columns 'Company' and 'Count'.
company_counts = (
    Data['company']
    .value_counts()
    .rename_axis('Company')
    .reset_index(name='Count')
)

# Bar chart of occurrences per car name.
fig_car_names = px.bar(
    data_frame=car_name_counts,
    x='Car_Name',
    y='Count',
    title='Value Counts for Car Names',
    labels={'Car_Name': 'Car Name', 'Count': 'Count'},
)

# Bar chart of occurrences per company.
fig_companies = px.bar(
    data_frame=company_counts,
    x='Company',
    y='Count',
    title='Value Counts for Companies',
    labels={'Company': 'Company', 'Count': 'Count'},
)

# Render both charts, car names first.
fig_car_names.show()
fig_companies.show()


# Combined Analysis

In [None]:
# Scatter of selling price against manufacturing year; the marker colour also
# encodes the selling price. Axis titles are applied via the chained
# update_layout(), which returns the figure.
fig = px.scatter(
    data_frame=Data,
    x='Year',
    y='Selling_Price',
    color='Selling_Price',
    title='Year vs. Selling Price',
    labels={'Year': 'Manufacturing Year', 'Selling_Price': 'Selling Price'},
).update_layout(xaxis_title='Manufacturing Year', yaxis_title='Selling Price')
fig.show()

In [None]:
# Box plot of present price per fuel type, one colour per fuel type.
# x and color encode the same column, so the default boxmode='group' would
# reserve a slot for every colour at every x position and draw each box
# narrow and off-centre. boxmode='overlay' keeps the boxes centred on
# their ticks.
fig = px.box(Data, x='Fuel_Type', y='Present_Price', color='Fuel_Type',
             boxmode='overlay',
             title='Present Prices vs. Fuel Type',
             labels={'Present_Price': 'Present Price', 'Fuel_Type': 'Fuel Type'})

fig.update_layout(xaxis_title='Fuel Type', yaxis_title='Present Price')
fig.show()

In [None]:
# Violin plot of kilometres driven per transmission type, with an embedded
# box plot (box=True) and every observation drawn as a point (points="all").
fig = px.violin(
    data_frame=Data,
    x='Transmission',
    y='Kms_Driven',
    box=True,
    points="all",
    title='Distribution of Kilometers Driven by Transmission Type',
    labels={'Kms_Driven': 'Kilometers Driven', 'Transmission': 'Transmission Type'},
).update_layout(xaxis_title='Transmission Type', yaxis_title='Kilometers Driven')
fig.show()

# Multivariate Analysis

Multivariate analysis involves examining the relationships between multiple variables simultaneously. This type of analysis helps uncover patterns, associations, and interactions among variables. 

Correlation Matrix:

    A correlation matrix helps identify linear relationships between pairs of numerical variables. Positive values indicate a positive correlation, negative values indicate a negative correlation, and values near zero indicate a weak or no correlation.

In [None]:
# Pairwise correlation (corr() defaults to Pearson) between the numeric
# columns, drawn as an annotated heatmap with two-decimal labels.
# Non-numeric columns are dropped explicitly: from pandas 2.0 onward
# DataFrame.corr() no longer skips them silently and raises on string
# columns such as Car_Name or Fuel_Type.
numeric_data = Data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")


Pair Plots:

    Pair plots are useful for visualizing relationships between numerical variables. Each scatter plot in the matrix represents the relationship between two variables, and the plots along the diagonal show the distribution of each variable (here as kernel density estimates, since diag_kind='kde' is used).

In [None]:
# Grid of pairwise scatter plots; diag_kind='kde' draws a kernel density
# estimate of each variable on the diagonal.
sns.pairplot(Data, diag_kind='kde')

Scatter Matrix:

    Similar to pair plots, a scatter matrix provides scatter plots for numerical variables. It can be helpful when you have more than two numerical variables.

In [None]:
# pandas' own pairwise scatter grid: 12x12 figure, points at 80% opacity,
# kernel density estimates on the diagonal (diagonal='kde').
from pandas.plotting import scatter_matrix
scatter_matrix(Data, alpha=0.8, figsize=(12, 12), diagonal='kde')


Box Plots with Categorical Variables:

    Box plots can be used to compare the distribution of numerical variables across different categories.

In [None]:
# One box per fuel type showing the distribution of Selling_Price.
sns.boxplot(x='Fuel_Type', y='Selling_Price', data=Data)


3D Scatter Plots:

    3D scatter plots are useful when you want to visualize the relationship between three numerical variables.

In [None]:
# Interactive 3D scatter: Year, Selling_Price and Kms_Driven on the three
# axes, with points coloured by Fuel_Type.
fig_3d = px.scatter_3d(
    data_frame=Data,
    x='Year',
    y='Selling_Price',
    z='Kms_Driven',
    color='Fuel_Type',
)
fig_3d.show()


Categorical Plots with Two Variables:

    These plots help visualize the relationships between two categorical variables.

In [None]:
# Row counts per fuel type, with each bar split by transmission type (hue).
sns.countplot(x='Fuel_Type', hue='Transmission', data=Data)


In [None]:
# pip install nbconvert