<a href="https://colab.research.google.com/github/pavansai26/UNITPRICE-PREDICTION-USING-MACHINE-LEARNING/blob/main/UNIT_PRICE_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **problem statement**

# predicting the unit price

# **Attribute Description**

# Invoice No - Invoice ID, encoded as Label
#StockCode - Unique code per stock, encoded as Label

#Description - The Description, encoded as Label

#Quantity - Quantity purchased

#InvoiceDate - Date of purchase

#UnitPrice - The target value, price of every product

#CustomerID - Unique Identifier for every Customer

#Country - Country of sales, encoded as Label

# importing the necessary libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **Plotting style**

In [None]:

# Global seaborn look for every plot below: white background with grid
# lines and the "muted" colour palette.
sns.set_style("whitegrid")
sns.set_palette("muted")


# **importing the model building libraries**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline

# ignoring the warnings

In [None]:
import warnings

# Silence the noisy warning families emitted by the plotting/ML stack.
# Registration order is kept (Deprecation, User, Runtime, Future) so the
# resulting filter list is identical to registering them one by one.
for _category in (DeprecationWarning, UserWarning, RuntimeWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_category)

# **setting the no.of rows and columns**

In [None]:
# Let pandas print (practically) every row and column instead of
# truncating wide/long frames in the notebook output.
pd.options.display.max_rows = 100000
pd.options.display.max_columns = 1000

# **accessing the drive**

In [None]:
# Colab-only: mount Google Drive at /gdrive so the CSV paths used by the
# read_csv calls below (which start with /gdrive) can be resolved.
from google.colab import drive
drive.mount('/gdrive')

# **Read training Data**

In [None]:
# Load the training split from the mounted Drive folder.
df_train = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Participants_Data_TGIH 2/Train.csv')

# **printing the data only top 5**

In [None]:
# Preview the first rows of the training frame (head() defaults to 5).
df_train.head()

# **Read testing data**

In [None]:
# Load the test split from the same Drive folder as the training data.
df_test = pd.read_csv('/gdrive/My Drive/Colab Notebooks/Participants_Data_TGIH 2/Test.csv')

# **printing the data only top 5**

In [None]:
# Preview the first rows of the test frame (head() defaults to 5).
df_test.head()

# **information about the data**

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

# As we can see, the train dataset contains more than 250000 rows. Fortunately, the train set does not contain any null values, so it is one less thing to worry about.





# Also, most of the columns of the dataset are of numeric type. Only one column InvoiceDate is of object type. But we have yet to check the distribution of the columns which are encoded as Label in the dataset:

In [None]:
# Report (rows, columns) of the training frame; sep='\n' keeps the label
# and the shape on separate lines.
print('observations and features in the train dataset :', df_train.shape, sep='\n')

# **DATA CLEANING**

# **CHECKING FOR NULL VALUES**

In [None]:
# `data` is just a second name for the training frame (no copy is made).
data = df_train
# Missing-value count per column.
data.isna().sum()

# there are no null values in the data

# **CHECKING NULL VALUES IN THE TEST DATASET**

In [None]:
# Missing-value count per column of the test frame.
df_test.isna().sum()

# Similarly, the test set doesn't contain any null values, which is good news.

# **separate Categorical columns**

In [None]:
# Names of the object/category-typed columns.
categorical_frame = data.select_dtypes(include=['object', 'category'])
cat_cols = categorical_frame.columns.tolist()
print(cat_cols)

# **separate numerical columns**

In [None]:
# Names of the int64/float64-typed columns.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns.tolist()
print(num_cols)

# **Get number of unique entries in each column with categorical data**

In [None]:
# Number of distinct InvoiceDate values in the training frame.
data['InvoiceDate'].nunique()

# **Drop irrelevant columns**

In [None]:
# New frame without the invoice date/number columns; `data` itself is
# left untouched because drop() returns a copy by default.
data_new = data.drop(columns=['InvoiceDate', 'InvoiceNo'])

# **printing the data only top 5**

In [None]:
# Preview the reduced frame (first 5 rows).
data_new.head()

# **exploring the data**

In [None]:
# Summary statistics, transposed so that each column of data_new becomes
# one row of the output.
data_new.describe().T

# **Check for skewness in the dataset**

In [None]:
# Skewness of every column of data_new.
data_new.skew()

In [None]:
# Histogram of the target column, drawn on a fresh 8x6 inch figure.
hist_size = (8, 6)
fig = plt.figure(figsize=hist_size)
data_new['UnitPrice'].hist(bins=30)

# From the above plot, it can be easily seen that the target column is heavily right skewed. This might help us to figure out the cross validation strategies for the dataset. With that being said, let's go ahead and check other columns.

In [None]:
# Pairwise correlations between all remaining columns.
corr = data_new.corr()

# Annotated heatmap of the correlation matrix on a 10x9 inch figure.
heatmap_size = (10, 9)
plt.figure(figsize=heatmap_size)
sns.heatmap(data=corr, annot=True, cmap='Blues')
plt.title("Feature Correlation Heatmap")
plt.show()

# We can see some non-zero correlation between the Country and CustomerID columns, though this small positive correlation does not convey anything meaningful.

# This can be easily seen by an example: the country with code 35 has customers with larger customer ids.



# It might be interesting to see the correlation of the target column with the rest.

In [None]:
# Correlation of every column with the target, strongest first.
corr['UnitPrice'].sort_values(ascending=False)


# It looks like the target column has a very small positive correlation with the StockCode column. The rest of the columns have almost negligible dependence on UnitPrice.

# **Checking the country column for unique countries**

In [None]:
# Distinct values that occur in the Country column.
print(data_new.Country.unique())

# **Filter out the top 5 countries**

In [None]:
# value_counts() sorts by frequency (descending), so the first five
# index labels are the five most frequent country codes.
country_counts = data_new['Country'].value_counts()
top5_countries = country_counts.head(5).index.tolist()
print(top5_countries)

In [None]:
# Horizontal bar chart with the number of rows per country code.
countplot_size = (10, 10)
fig = plt.figure(figsize=countplot_size)
sns.countplot(y=data_new['Country'])
plt.show()

# From above, it looks like Country column is categorical where the numbers might represent the country code.

#  This means we need to further process it appropriately before putting it in the machine learning model. 

# We also need to check other columns which are label encoded.

In [None]:
# Number of distinct customers in the training data.
data_new['CustomerID'].nunique()

# There are around 4339 unique customers spread across 37 different countries.

# **Filter out the top 5 customers**

In [None]:
# The five customer ids with the most rows (value_counts() is sorted by
# frequency, descending).
customer_counts = data_new['CustomerID'].value_counts()
top5_customers = customer_counts.head(5).index.tolist()
print(top5_customers)

# **Dataframe containing only top customers**

In [None]:
# Keep only the rows that belong to one of the top-5 customers.
is_top_customer = data_new['CustomerID'].isin(top5_customers)
top_customer_countries_df = data_new.loc[is_top_customer]
top_customer_countries_df


# **top5 customers belonging to which country and dataframe**

In [None]:
# First 5 rows of the top-customer subset.
top_customer_countries_df.head()


In [None]:
# Distinct-value count per column within the top-customer subset.
top_customer_countries_df.nunique()

In [None]:
# Point plot of UnitPrice per top customer, coloured by country.
# catplot is a figure-level function: it creates its own figure and is
# sized through height/aspect, so no plt.figure() call is made beforehand
# (that call only left an empty, unused 15x15 figure behind).
sns.catplot(x="CustomerID", y="UnitPrice", hue="Country", alpha=0.5, markers=["^", "o"], linestyles=['-', '-.'], kind="point", height=6, aspect=0.9,
            data=top_customer_countries_df)

plt.show()

# From the above plot, there is only one top buyer from country with country code 10 and rest of them belong to country code 35. Also, the vertical bar on each data point shows, the spread of the unit price for the items they bought.

# **Take dependent variable into y**





In [None]:
# Target vector: the UnitPrice column.
y = data_new.UnitPrice
y.head(n=2)

In [None]:
# Turn the target Series into a single-column 2-D array (n_samples, 1).
y = y.values.reshape(-1,1)

# **Take independent variable into x**

In [None]:
# Feature matrix: every column of data_new except the target.
x = data_new.drop(columns='UnitPrice')
x.head(n=2)

# **DATA Transformation**

# **transforming the data into the normal distribution to improve the performance of the model**

# **To transform the data into normal distribution by using Power transformer method**

# **power transform the raw data**

In [None]:
# Yeo-Johnson power transform of the target, followed by standardization
# (standardize=True). The fitted transformer stays in `power` so that
# predictions can be mapped back with power.inverse_transform later on.
power = PowerTransformer(method='yeo-johnson', standardize=True)
y = power.fit_transform(y)

# **fitting different models using sklearn pipeline**

In [None]:
from numba import jit  # NOTE(review): not used in this cell; kept in case other cells rely on the name

# Candidate regressors. Each one sits behind its own MinMaxScaler inside a
# Pipeline, so the scaler is re-fitted on the training part of every CV fold.
pipelines = []

pipelines.append(('ScaledLR', Pipeline([('Scaler', MinMaxScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSo', Pipeline([('Scaler', MinMaxScaler()), ('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', MinMaxScaler()),('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', MinMaxScaler()),('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', MinMaxScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', MinMaxScaler()),('GBM', GradientBoostingRegressor())])))


results = []
names = []

# One splitter shared by all candidates so every model is scored on the
# same 10 folds. shuffle=True is required for random_state to have any
# effect; recent scikit-learn releases raise a ValueError when random_state
# is set while shuffle is left at its default of False.
kfold = KFold(n_splits=10, shuffle=True, random_state=21)

for name, model in pipelines:
    # Scores are negated MSE values (closer to 0 is better).
    cv_results = cross_val_score(model, x, y, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    cv_info = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(cv_info)


# **Drop irrelevant columns in test data**

In [None]:
# Apply the same column removal to the test frame that was applied to the
# training frame, so both have matching feature columns.
df_test_new = df_test.drop(columns=['InvoiceDate', 'InvoiceNo'])
df_test_new.head()

# **running for different K values to know which yields the max accuracy.**

In [None]:
score = []

# Hold out 20% of the training data for choosing k. Scoring on the rows the
# model was fitted on is not informative here: with weights='distance' every
# training point has a zero-distance neighbour (itself), so the training
# score is near-perfect for every k and cannot distinguish between them.
x_fit, x_val, y_fit, y_val = train_test_split(x, y, test_size=0.2, random_state=21)

# score[i - 1] holds the validation R^2 for n_neighbors = i (i = 1..19).
for i in range(1, 20):
    clf = KNeighborsRegressor(n_neighbors=i, weights='distance', p=1)
    clf.fit(x_fit, y_fit)
    score.append(clf.score(x_val, y_val))

# **printing the max accuracy value**

In [None]:
# score[0] belongs to k = 1, hence the +1 when turning the position of the
# best score back into a neighbour count.
best_score = max(score)
k_max = score.index(best_score) + 1
print("At K = {}, Max Accuracy = {}".format(k_max, best_score * 100))

# **fitting the data to knn using which k value we got max accuracy**

In [None]:
# Refit KNN on the full training data with the selected neighbour count.
clf = KNeighborsRegressor(n_neighbors = k_max,  weights = 'distance', p=1)
clf.fit(x, y)
# NOTE: this score is computed on the same rows the model was fitted on,
# so it is a training score, not an estimate for unseen data.
print(clf.score(x, y))   


# **predictions using test data**

In [None]:
# Predict the (power-transformed) target for the reduced test frame.
predictions = clf.predict(df_test_new)

# **converting predictions to data frame**

In [None]:
# Wrap the raw prediction array in a DataFrame for inspection.
predict_df = pd.DataFrame(predictions)
predict_df.head()

# **Applying inverse power transform to get back original form of unit price**

In [None]:
# Undo the power transform that was fitted on the training target, so the
# predictions are back on the original UnitPrice scale.
inverse_transformed = power.inverse_transform(predict_df)
inverse_transformed[:5]
# The result is an array, so the first rows are shown by slicing rather
# than with .head().

# **printing the final unitprice**

In [None]:
# Final predictions as a DataFrame, rounded to two decimals.
final_result = pd.DataFrame(inverse_transformed).round(decimals=2)
final_result.head()


# **converting final predictions to excel sheet**

In [None]:
# Write the rounded predictions to an Excel workbook in the current
# working directory.
final_result.to_excel("UnitPricePrediction.xlsx")