In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import mean_squared_error, r2_score

In [16]:
data=pd.read_excel("Adidas US Sales Datasets.xlsx")

In [17]:
data.head()

Unnamed: 0,Retailer,Retailer ID,Invoice Date,Region,State,City,Product,Price per Unit,Units Sold,Total Sales,Operating Profit,Operating Margin,Sales Method
0,Foot Locker,1185732,2020-01-01,Northeast,New York,New York,Men's Street Footwear,50.0,1200,600000.0,300000.0,0.5,In-store
1,Foot Locker,1185732,2020-01-02,Northeast,New York,New York,Men's Athletic Footwear,50.0,1000,500000.0,150000.0,0.3,In-store
2,Foot Locker,1185732,2020-01-03,Northeast,New York,New York,Women's Street Footwear,40.0,1000,400000.0,140000.0,0.35,In-store
3,Foot Locker,1185732,2020-01-04,Northeast,New York,New York,Women's Athletic Footwear,45.0,850,382500.0,133875.0,0.35,In-store
4,Foot Locker,1185732,2020-01-05,Northeast,New York,New York,Men's Apparel,60.0,900,540000.0,162000.0,0.3,In-store


In [18]:
X = data[['Units Sold','Operating Profit','Operating Margin','Price per Unit']] 
y = data['Total Sales'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
l_model = LinearRegression()
l_model.fit(X_train, y_train)
y_predicted = l_model.predict(X_test)
y_predicted

array([-32452.3596316 ,  10921.45334933,  29527.67958081, ...,
        40626.5680679 ,  37287.21889737, -12338.36690407])

In [20]:
l_model.score(X_test,y_test)

0.957043926267903

In [21]:
R_model = RandomForestRegressor (n_estimators = 5,random_state=0)
R_model.fit(X_train, y_train)
y_predicted = R_model.predict(X_test)

In [22]:
R_model.score(X_test,y_test)

0.999158769922408

In [23]:
D_model = tree.DecisionTreeRegressor()
D_model.fit(X_train, y_train)
y_predicted = D_model.predict(X_test)

In [24]:
D_model.score(X_test,y_test)

0.9990783847906177

In [25]:
k_model = KNeighborsRegressor(n_neighbors=5)
k_model.fit(X_train, y_train)
y_predicted = k_model.predict(X_test)

In [26]:
k_model.score(X_test,y_test)

0.9386249441672256

In [27]:
model_names = ['Linear Regression', 'Random Forest', 'Decision Tree', 'K-Nearest Neighbors']

mse_values = []
r2_values = []

for model, name in zip([LinearRegression(), RandomForestRegressor(n_estimators=5, random_state=0),
                        DecisionTreeRegressor(), KNeighborsRegressor(n_neighbors=5)], model_names):
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    mse = mean_squared_error(y_test, y_predicted)
    r2 = r2_score(y_test, y_predicted)
    mse_values.append(mse)
    r2_values.append(r2)

comparison_table = pd.DataFrame({
    'Model': model_names,
    'Mean Squared Error': mse_values,
    'R-squared': r2_values
})

print(comparison_table)

                 Model  Mean Squared Error  R-squared
0    Linear Regression        8.143064e+08   0.957044
1        Random Forest        1.594697e+07   0.999159
2        Decision Tree        1.887324e+07   0.999004
3  K-Nearest Neighbors        1.163470e+09   0.938625


In [28]:
#input1=[1000,150000,0.3,50]->500000

new_data = [[1200, 300000, 0.5, 50]]  

linear_regression_prediction = l_model.predict(new_data)

knn_prediction = k_model.predict(new_data)

decision_tree_prediction = D_model.predict(new_data)

random_forest_prediction = R_model.predict(new_data)

comparison_df = pd.DataFrame({
    'Model': ['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest'],
    'Predicted Total Sales': [linear_regression_prediction[0], knn_prediction[0], decision_tree_prediction[0], random_forest_prediction[0]]
})

print(comparison_df)

               Model  Predicted Total Sales
0  Linear Regression          707050.475163
1                KNN          592000.000000
2      Decision Tree          600000.000000
3      Random Forest          604000.000000




Conclusion

A lower MSE indicates better performance. From the provided MSE values, we observe that Random Forest and Decision Tree models have the lowest MSE.This metric represents the proportion of the variance in the dependent variable that is predictable from the independent variables. R-squared values range from 0 to 1, where higher values indicate better model fit.From the above metrics, Random Forest and Decision Tree models perform better in terms of both MSE and R-squared compared to Linear Regression and K-Nearest Neighbors.