<a href="https://colab.research.google.com/github/mjdabendoh/Finance-Tradin/blob/main/Analysis_SalesProfit_Superstore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Analysis of sales and profit for the Superstore sales data from the Tableau user group, using TFX

In [None]:
# For data analysis & Visualization
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_data_validation as tfdv

from google.colab import files
from google.colab import data_table
from datetime import datetime


# For the model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import set_config


%matplotlib inline

# Record which TensorFlow / TFDV releases this run used.
print(f"TF version: {tf.__version__}")
print(f"TFDV version: {tfdv.__version__}")

# Confirmation that the import cell ran to the end without raising.
print("\nAll the module are imported \nSo, enjoy your analyse !!!")

In [None]:
from google.colab import files

# Prompt for a file upload in Colab and keep the call's result in `uploaded`.
# The next cell reads '/content/Sample - Superstore.xlsx', so that workbook is
# the file expected here.
uploaded = files.upload()

In [None]:
# Location of the Superstore workbook inside the Colab runtime.
base_excelx = '/content/Sample - Superstore.xlsx'

# Load the workbook into the raw frame `df` and preview the first rows.
df = pd.read_excel(io=base_excelx)
df.head(n=5)

In [None]:
from google.colab import data_table


# Turn on Colab's data_table formatter for DataFrame output.
data_table.enable_dataframe_formatter()

# `DataFrame` is bound to the same object as `df` here (no copy is made);
# later cells rebind `DataFrame` to new frames derived from it.
DataFrame = df
DataFrame.head()

In [None]:
# Summarise the loaded frame (column names, non-null counts, dtypes).
DataFrame.info()

In [None]:
# @title Discount vs Profit

from matplotlib import pyplot as plt

# Scatter of Discount (x) against Profit (y), one point per row.
DataFrame.plot.scatter(x='Discount', y='Profit', s=32, alpha=.8)

# Hide the top and right frame lines of the axes that was just drawn.
current_axes = plt.gca()
current_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# Drop the identifier-style columns and the customer name; none of them is
# referenced again in this notebook.
# The result is derived from the untouched `df` (which `DataFrame` aliases up to
# this point) instead of from `DataFrame` itself, so the cell is idempotent:
# re-running it no longer raises KeyError for columns that are already gone.
DataFrame = df.drop(
    columns=['Row ID', 'Order ID', 'Customer ID', 'Customer Name', 'Product ID']
)
DataFrame.head()

In [None]:
# Convert the date columns to datetime64.
# The format is ISO year-month-day, '%Y-%m-%d'; the previous string had the '-'
# and '%' in front of the day transposed ('%Y-%m%-d'), which is not a valid
# parsing format.
# 'Ship Date' is converted as well, because its year/month/day parts are
# extracted later just like those of 'Order Date'.
for date_col in ('Order Date', 'Ship Date'):
    DataFrame[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d')
DataFrame.info()

In [None]:
# Split each date into year / month / day feature columns named
# '<Order|Ship>_Date_<part>' (same six columns, same order as before).
for prefix in ('Order', 'Ship'):
    # Coerce through pd.to_datetime so the `.dt` accessor is guaranteed to
    # exist; a column that is already datetime64 keeps its values unchanged.
    dates = pd.to_datetime(df[f'{prefix} Date'])
    for part in ('year', 'month', 'day'):
        DataFrame[f'{prefix}_Date_{part}'] = getattr(dates.dt, part)

DataFrame.info()

In [None]:
# Most of the features are categorical, so they are converted to numeric
# indicator columns by one-hot encoding with pandas get_dummies.
# Quantity is numerical but is treated as categorical: per the original
# analysis it only takes whole values between 1 and 14.
categorical_columns = [
    'Ship Mode', 'Segment', 'Country', 'City', 'State', 'Postal Code',
    'Region', 'Category', 'Sub-Category', 'Product Name', 'Quantity',
    'Order_Date_year', 'Order_Date_month', 'Order_Date_day',
    'Ship_Date_year', 'Ship_Date_month', 'Ship_Date_day',
]

# The raw datetime columns must not reach the encoded frame: they are not in
# the list above, so get_dummies would pass them through unchanged and the
# scaler/Lasso pipeline downstream cannot consume datetime values.
# errors='ignore' keeps this cell valid if they were already dropped.
encoded_df = pd.get_dummies(
    DataFrame.drop(columns=['Order Date', 'Ship Date'], errors='ignore'),
    columns=categorical_columns,
)
encoded_df.head()

In [None]:
# Remove the raw date columns; their year/month/day parts already exist as
# separate columns. errors='ignore' makes the cell safe to re-run: a second
# execution finds the columns already gone and leaves the frame as it is
# instead of raising KeyError.
DataFrame = DataFrame.drop(columns=['Order Date', 'Ship Date'], errors='ignore')
DataFrame.info()

In [None]:
# Convert the Pandas DataFrame to a TensorFlow Dataset
# NOTE(review): `dataset` is not consumed by the TFDV calls in this cell, which
# take the DataFrame directly; it may be unused - confirm before removing.
dataset = tf.data.Dataset.from_tensor_slices(dict(DataFrame))

# Calculate statistics using tfdv.generate_statistics_from_dataframe
stats = tfdv.generate_statistics_from_dataframe(DataFrame)

# Visualize the statistics
tfdv.visualize_statistics(stats)

In [None]:
# Infer a schema from the statistics in `stats` and display it
schema = tfdv.infer_schema(stats)
tfdv.display_schema(schema)

Building the model
We will use a Lasso regression model to identify the most important features: Lasso shrinks the
coefficients of unimportant features to zero, which also helps to avoid overfitting.

In [59]:
# First we divide the dataset into target and features.
#
# The target column is removed by name with drop(). A filter written as
# `x not in ('Profit')` does not do this: ('Profit') is just the string
# 'Profit' (no trailing comma, so no tuple), which turns `in` into a substring
# test that would also discard any column whose name is a substring of 'Profit'.
#
# Any datetime columns still present in `encoded_df` are excluded as well,
# since the scaler/Lasso pipeline needs purely numeric features.
DataFrame_x = (
    encoded_df
    .drop(columns=['Profit'])
    .select_dtypes(exclude=['datetime', 'datetimetz'])
)
# Kept as a one-column DataFrame (double brackets), as before.
DataFrame_y = encoded_df[['Profit']]

In [None]:
# Preview the feature matrix. The frame built in the previous cell is named
# `DataFrame_x`; `df_x` is never defined in this notebook and raised NameError.
DataFrame_x.head()

In [None]:
# Features are standardised with StandardScaler before they reach the Lasso
# step; both stages are chained in one pipeline so they are fitted together.
lasso_model = Lasso()
pipeline_steps = [('scaler', StandardScaler()), ('lasso', lasso_model)]
lasso_pl = Pipeline(steps=pipeline_steps)
lasso_pl.fit(DataFrame_x, DataFrame_y)

# R^2 is computed on the same rows the pipeline was fitted on (in-sample).
pred_y = lasso_pl.predict(DataFrame_x)
lasso_model_r2val = r2_score(y_true=DataFrame_y, y_pred=pred_y)

print(f"Lasso can explain: {lasso_model_r2val}")