In [36]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [43]:
class DataHandler:
    """Load a CSV file and expose it as a cleaned, shuffled, min-max scaled DataFrame.

    The file is read and pre-processed once, when the object is constructed;
    the result is available through :meth:`get_dataset`.
    """

    def __init__(self, file_path):
        """Read ``file_path`` and pre-process it immediately.

        Args:
            file_path: Location of the CSV file; passed unchanged to
                ``pd.read_csv``. The file must contain a ``Date`` column.
        """
        self.dataset = pd.read_csv(file_path)

        self.pre_process()

    def normalize_data(self, df):
        """Min-max scale every column of ``df`` using that column's own min and max.

        Args:
            df: DataFrame to scale. It is not modified.

        Returns:
            A new DataFrame in which each column is ``(value - min) / (max - min)``.
            A column whose values are all identical is mapped to ``0.0`` rather
            than to NaN.
        """
        col_min = df.min()
        col_range = df.max() - col_min
        # A constant column has a zero range, and 0 / 0 would fill it with NaN.
        # Dividing by 1 instead leaves the (already zero) numerator untouched.
        safe_range = col_range.replace(0, 1)
        return (df - col_min) / safe_range

    def pre_process(self):
        """Clean, shuffle and scale ``self.dataset``, storing the result back on it.

        Steps, in order: drop the ``Date`` column, drop rows containing missing
        values, drop duplicate rows, shuffle all rows with a fixed seed, then
        min-max scale every remaining column.

        Raises:
            KeyError: If the dataset has no ``Date`` column.
        """
        cleaned = (
            self.dataset
            .drop(columns=['Date'])
            .dropna()
            .drop_duplicates()
        )

        # frac=1 keeps every row; the fixed random_state makes the order repeatable.
        shuffled = cleaned.sample(random_state=50, frac=1)
        self.dataset = self.normalize_data(shuffled)

    def get_dataset(self):
        """Return the pre-processed DataFrame."""
        return self.dataset


In [65]:

class Clustering:
    """Fit a linear regression on a training CSV and score it on a test CSV.

    Despite its name, this class performs no clustering: it wraps a
    ``LinearRegression`` model that predicts the ``number_sold`` column from
    all remaining columns.

    NOTE(review): the two files are loaded through separate ``DataHandler``
    objects, so each set is scaled with its own min/max rather than with the
    training set's statistics. The reported error is therefore in scaled
    units, not in units of ``number_sold``.
    """

    def __init__(self, train_path='../dataset/train.csv', test_path='../dataset/test.csv'):
        """Load and pre-process both datasets and create an unfitted model.

        Args:
            train_path: CSV file used for fitting.
            test_path: CSV file used for evaluation.
        """
        self.trained = False
        self.linear_regression = LinearRegression()
        self.train = DataHandler(train_path)
        self.test = DataHandler(test_path)

        self.train_dataset = self.train.get_dataset()
        self.test_dataset = self.test.get_dataset()

    def train_model(self):
        """Fit the regression on the training set and mark the model as trained."""
        features = self.train_dataset.drop(columns=['number_sold'])
        target = self.train_dataset['number_sold']

        self.linear_regression.fit(features, target)
        self.trained = True

    def evalute_model(self, max_rows=10):
        """Print a sample of test predictions followed by the test-set MSE.

        If the model has not been trained yet, a reminder is printed and
        nothing else happens.

        Args:
            max_rows: Maximum number of (real, predicted) pairs to print
                before the error summary.

        Returns:
            None. All results are written to standard output.
        """
        if not self.trained:
            print('Please train the model first by calling the train_model method')
            return

        x_test = self.test_dataset.drop(columns=['number_sold'])
        y_test = self.test_dataset['number_sold']
        prediction = self.linear_regression.predict(x_test)

        MSE = mean_squared_error(y_test, prediction)

        print(f'Showing only {max_rows} prediction')

        # Zipping against range(max_rows) stops the preview after max_rows pairs
        # without needing a manual counter.
        for _, (real_value, prediction_value) in zip(range(max_rows), zip(y_test, prediction)):
            print("================================")
            print(f"Real value: {real_value}")
            print(f"Prediction: {prediction_value}")
            print('')

        print(f"Mean Square Error using linear regression (after normalization) : {MSE}")

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    evaluate_model = evalute_model
    

# Build the model wrapper; constructing it reads and pre-processes both CSV files.
clustering = Clustering()

# Fit on the training set, then print sample predictions and the test-set MSE.
clustering.train_model()
clustering.evalute_model()

Showing only 10 prediction
Real value: 0.541095890410959
Prediction: 0.5577311766476138

Real value: 0.8664383561643836
Prediction: 0.5291698690507844

Real value: 0.8732876712328768
Prediction: 0.6169056060059384

Real value: 0.6164383561643836
Prediction: 0.5796651108864023

Real value: 0.7545662100456622
Prediction: 0.6388395402447269

Real value: 0.6689497716894978
Prediction: 0.6554079736398291

Real value: 0.7842465753424658
Prediction: 0.6541461011254744

Real value: 0.1415525114155251
Prediction: 0.6049127318042112

Real value: 0.7853881278538812
Prediction: 0.650832414446454

Real value: 0.8299086757990868
Prediction: 0.6674008478415562

Mean Square Error using linear regression (after normalization) : 0.05187177089940817
