In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

pd.options.plotting.backend = "plotly"

# Question 1

## Part a

In [2]:
# Load dataset 1, then show and save a scatter plot of y against x.
df = pd.read_csv(filepath_or_buffer="dataset1/dataset1.csv")
fig = df.plot(kind='scatter', x='x', y='y')
fig.show()
fig.write_image('images/implementation/q1/p1.png')

## Part b

In [3]:
# Label the rows as three consecutive groups of ceil(len(df) / 3) rows each
# and colour the scatter plot by group.
rows_per_tag = int(np.ceil(len(df)/3))
tag_rows = [
    tag
    for tag in ('partial 1', 'partial 2', 'partial 3')
    for _ in range(rows_per_tag)
]
# join() is a left join on the index, so any surplus tags past the end of
# df are simply not used.
tagged_df = df.join(pd.DataFrame({'tag': tag_rows}))
fig = px.scatter(tagged_df, x="x", y="y", color="tag")
fig.show()
fig.write_image('images/implementation/q1/p2.png')

## Part c

In [4]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every figure and error value derived from it) reproducible
# after a kernel restart.
train, test = train_test_split(df, train_size=0.8, random_state=42)

In [5]:
def dmse_dwi(y_real, x, w, dwi):
    """Partial derivative of the mean squared error w.r.t. ``w[dwi]``.

    ``w`` holds polynomial coefficients in ``np.polyval`` order (highest
    power first), so ``w[dwi]`` multiplies ``x ** (len(w) - 1 - dwi)``.
    """
    exponent = (len(w) - 1) - dwi
    residual = y_real - np.polyval(w, x)
    scale = -2/len(x)
    return scale * np.sum(residual * x**exponent)


def dmae_dwi(y_real, x, w, dwi):
    """Partial (sub)derivative of the mean absolute error w.r.t. ``w[dwi]``.

    ``w`` holds polynomial coefficients in ``np.polyval`` order (highest
    power first), so ``w[dwi]`` multiplies ``x ** (len(w) - 1 - dwi)``.
    """
    exponent = (len(w) - 1) - dwi
    # sign(prediction - target) is the derivative of |prediction - target|.
    error_sign = np.sign(np.polyval(w, x) - y_real)
    return (1/len(x)) * np.sum(error_sign * x**exponent)


def drmse_dwi(y_real, x, w, dwi):
    """Partial derivative of the root mean squared error w.r.t. ``w[dwi]``.

    ``w`` holds polynomial coefficients in ``np.polyval`` order (highest
    power first), so ``w[dwi]`` multiplies ``x ** (len(w) - 1 - dwi)``.

    Returns:
        The derivative as a float. When every residual is exactly zero the
        RMSE is not differentiable; ``0.0`` (a valid subgradient) is returned
        instead of the NaN that ``0 / 0`` would produce, so a perfect fit
        does not corrupt the weights in later updates.
    """
    n = len(x)
    exponent = (len(w) - 1) - dwi
    residual = y_real - np.polyval(w, x)

    denominator = np.sqrt(np.sum(residual**2))
    if denominator == 0:
        return 0.0

    numerator = np.sum(residual * x**exponent)
    return (-1) * (1/np.sqrt(n)) * (numerator/denominator)


def error_function_to_text(error_function):
    """Return the short label ("RMSE", "MAE" or "MSE") for a derivative function.

    Returns ``None`` when ``error_function`` is not one of the three known
    derivative functions.
    """
    known_labels = (
        (drmse_dwi, "RMSE"),
        (dmae_dwi, "MAE"),
        (dmse_dwi, "MSE"),
    )
    for candidate, label in known_labels:
        if error_function == candidate:
            return label
    return None

In [6]:
def gradient_descent(learning_rate, degree, iteration, derror_dwi, train):
    """Fit a polynomial of the given degree by gradient descent.

    Args:
        learning_rate: Base learning rate; it is decayed linearly towards
            zero over the run.
        degree: Degree of the polynomial. A degree-``d`` polynomial has
            ``d + 1`` coefficients, so ``degree + 1`` weights are fitted.
        iteration: Number of update steps.
        derror_dwi: Callable ``(y_real, x, w, dwi) -> float`` returning the
            partial derivative of the loss with respect to ``w[dwi]``.
        train: Mapping/DataFrame with ``'x'`` and ``'y'`` entries.

    Returns:
        Array of ``degree + 1`` coefficients in ``np.polyval`` order
        (highest power first), starting from a uniform random initialisation.
    """
    # Allocating only `degree` weights would silently fit a polynomial of
    # degree `degree - 1`.
    n_coefficients = degree + 1
    w = np.random.rand(n_coefficients)
    y_real = train['y']
    x = train['x']

    for i in range(iteration):
        # All partial derivatives are evaluated at the same w before updating.
        dw = np.array(
            [derror_dwi(y_real=y_real, x=x, w=w, dwi=d) for d in range(n_coefficients)],
            dtype=float,
        )
        # Linearly decaying step: full learning rate at i=0, approaching 0 at the end.
        w -= (learning_rate * ((iteration - i)/iteration) * dw)

    return w

In [None]:
# Fit one polynomial per (iterations, degree, loss) combination and save a
# plot of the fitted curve over the training points.
for iteration in [5000, 10_000]:
    for degree in [5, 8, 10]:
        for derror_function in [dmae_dwi, drmse_dwi, dmse_dwi]:
            w = gradient_descent(0.1, degree, iteration, derror_function, train)
            error_name = error_function_to_text(derror_function)

            # Evaluate the fitted polynomial across the training x-range.
            polynomial = np.poly1d(w)
            x_values = np.linspace(min(train['x']), max(train['x']))
            y_values = polynomial(x_values)

            fig = train.plot(x='x', y='y', kind='scatter')
            fig.add_trace(go.Scatter(x=x_values, y=y_values, mode="lines"))
            plot_title = {
                'text': f"degree={degree}, iterations={iteration}, error_function={error_name}",
                'x': 0.5
            }
            fig.update_layout(
                showlegend=False,
                title=plot_title,
                margin={'l': 0, 'r': 0, 'b': 0}
            )
            fig.write_image(
                f'images/implementation/q1/part_c/{degree}_{iteration}_{error_name}.png')

## Part D

In [7]:
def mse(y_real, x, w):
    """Mean squared error of the polynomial ``w`` (``np.polyval`` order) on ``(x, y_real)``."""
    deviation = np.polyval(w, x) - y_real
    return (1/len(x)) * np.sum(deviation**2)


def mae(y_real, x, w):
    """Mean absolute error of the polynomial ``w`` (``np.polyval`` order) on ``(x, y_real)``."""
    deviation = np.polyval(w, x) - y_real
    return (1/len(x)) * np.sum(np.absolute(deviation))


def rmse(y_real, x, w):
    """Root mean squared error of the polynomial ``w`` (``np.polyval`` order) on ``(x, y_real)``."""
    deviation = np.polyval(w, x) - y_real
    root_of_squared_sum = np.sqrt(np.sum(deviation**2))
    return (1/np.sqrt(len(x))) * root_of_squared_sum

In [8]:
def verbose_gradient_descent(learning_rate, degree, iteration, derror_dwi, train, test, error_function):
    """Gradient descent for a polynomial fit that also records per-step diagnostics.

    Args:
        learning_rate: Base learning rate; it is decayed linearly towards
            zero over the run.
        degree: Degree of the polynomial. A degree-``d`` polynomial has
            ``d + 1`` coefficients, so ``degree + 1`` weights are fitted.
        iteration: Number of update steps.
        derror_dwi: Callable ``(y_real, x, w, dwi) -> float`` returning the
            partial derivative of the loss with respect to ``w[dwi]``.
        train: Mapping/DataFrame with ``'x'`` and ``'y'`` used for the updates.
        test: Mapping/DataFrame with ``'x'`` and ``'y'`` used only for evaluation.
        error_function: Callable ``(y_real, x, w) -> float`` used to score the
            weights after every update.

    Returns:
        Tuple ``(w, train_error, test_error, step_size)`` where ``w`` holds
        ``degree + 1`` coefficients in ``np.polyval`` order and the other
        three are arrays of length ``iteration``. ``step_size[i]`` is the
        absolute value of the mean update actually applied at step ``i``
        (i.e. including the learning-rate decay).
    """
    # Allocating only `degree` weights would silently fit a polynomial of
    # degree `degree - 1`.
    n_coefficients = degree + 1
    w = np.random.rand(n_coefficients)
    step_size = np.zeros(iteration)
    test_error = np.zeros(iteration)
    train_error = np.zeros(iteration)

    for i in range(iteration):
        # All partial derivatives are evaluated at the same w before updating.
        dw = np.array(
            [derror_dwi(y_real=train['y'], x=train['x'], w=w, dwi=d) for d in range(n_coefficients)],
            dtype=float,
        )
        # Linearly decaying step: full learning rate at i=0, approaching 0 at the end.
        step = learning_rate * ((iteration-i)/iteration) * dw
        w -= step

        # Record the step that was really taken, decay factor included.
        step_size[i] = np.absolute(np.average(step))
        test_error[i] = np.absolute(error_function(y_real=test['y'], x=test['x'], w=w))
        train_error[i] = np.absolute(error_function(y_real=train['y'], x=train['x'], w=w))

    return w, train_error, test_error, step_size

In [None]:
# For every (iterations, degree, loss) combination, run the instrumented
# gradient descent and save two line charts: train/test error per iteration
# and step size per iteration.
for iteration in [5000, 10000]:
    for degree in [5, 8, 10]:
        for (derror_function, error_function) in [(dmae_dwi, mae), (drmse_dwi, rmse), (dmse_dwi, mse)]:
            w, train_error, test_error, step_size = verbose_gradient_descent(
                0.1, degree, iteration, derror_function, train, test, error_function)

            # Pieces shared by both figures of this run.
            error_name = error_function_to_text(derror_function)
            subtitle = '<br><sup>Degree={}, Iteration={}, Error Function={}</sup>'.format(degree, iteration, error_name)
            file_name = '{}_{}_{}.png'.format(degree, iteration, error_name)

            error_figure_dataframe = pd.DataFrame(data={
                'iteration': list(range(len(train_error))),
                'Train Error': train_error,
                'Test Error': test_error
            })
            error_figure = px.line(
                error_figure_dataframe,
                x='iteration',
                y=['Train Error', 'Test Error'])
            error_figure.update_layout({
                "title": {
                    'text': 'Change of Error in Iterations ' + subtitle,
                    'x': 0.5,
                    'font': {'family': 'Nimbus Sans', 'size': 20}
                },
            })
            error_figure.write_image('images/implementation/q1/part_d/error/' + file_name)

            step_size_figure_dataframe = pd.DataFrame(data={
                'iteration': list(range(len(train_error))),
                'Step Size': step_size,
            })
            step_size_figure = px.line(
                step_size_figure_dataframe,
                x='iteration',
                y='Step Size')
            step_size_figure.update_layout({
                "title": {
                    'text': 'Change of Step Size in Iterations ' + subtitle,
                    'x': 0.5,
                    'font': {'family': 'Nimbus Sans', 'size': 20}
                },
            })
            step_size_figure.write_image('images/implementation/q1/part_d/step_size/' + file_name)

## Part E

In [9]:
def calculate_w_based_on_normal_equation(degree, x, y):
    """Least-squares polynomial fit (closed form).

    Args:
        degree: Degree of the polynomial to fit.
        x: 1-D sequence of sample positions.
        y: 1-D sequence of target values, same length as ``x``.

    Returns:
        Array ``theta`` of ``degree + 1`` coefficients in *increasing* power
        order (``theta[d]`` multiplies ``x ** d``); flip it before passing it
        to ``np.polyval``.
    """
    # Design matrix with columns x**0, x**1, ..., x**degree.
    X = np.vander(np.asarray(x, dtype=float), degree + 1, increasing=True)
    # Solve the least-squares problem directly instead of forming
    # inv(X.T @ X): the Gram matrix of a polynomial basis is badly
    # conditioned for higher degrees, and lstsq also copes with
    # rank-deficient inputs (it returns the minimum-norm solution).
    theta, *_ = np.linalg.lstsq(X, np.asarray(y, dtype=float), rcond=None)
    return theta

In [17]:
# Compare the MSE of the closed-form fit and the gradient-descent fit on the
# train and test sets, for each polynomial degree.
normal_equation_train_erorr = []
normal_equation_test_erorr = []
gradient_descent_train_error = []
gradient_descent_test_error = []

for degree in [5, 8, 10]:
    # The normal-equation helper returns the lowest power first; np.polyval
    # (used inside mse) expects the highest power first, hence the flip.
    w_normal_equation = np.flip(calculate_w_based_on_normal_equation(degree, train['x'], train['y']))
    w_gradient_descent = gradient_descent(0.1, degree, 5000, dmse_dwi, train)

    normal_equation_train_erorr.append(mse(train['y'], train['x'], w_normal_equation))
    normal_equation_test_erorr.append(mse(test['y'], test['x'], w_normal_equation))
    gradient_descent_train_error.append(mse(train['y'], train['x'], w_gradient_descent))
    gradient_descent_test_error.append(mse(test['y'], test['x'], w_gradient_descent))

degree_labels = ['5', '8', '10']

train_error_fig = go.Figure(
    data=[
        go.Bar(name='Normal Equation', x=degree_labels, y=normal_equation_train_erorr),
        go.Bar(name='Gradient Descent', x=degree_labels, y=gradient_descent_train_error)
    ],
    layout={
        'title': {'text': 'Comparison of Train Error in Normal Equation and Gradient Descent', 'x': 0.5},
        'barmode': 'group'
    }
)
train_error_fig.show()

test_error_fig = go.Figure(
    data=[
        go.Bar(name='Normal Equation', x=degree_labels, y=normal_equation_test_erorr),
        go.Bar(name='Gradient Descent', x=degree_labels, y=gradient_descent_test_error)
    ],
    layout={
        'title': {'text': 'Comparison of Test Error in Normal Equation and Gradient Descent', 'x': 0.5},
        'barmode': 'group'
    }
)
test_error_fig.show()


## Part E: regularized normal equation

In [19]:
def calculate_w_based_on_normal_equation_with_lambda(degree, x, y, lambda_):
    """Regularized least-squares polynomial fit (closed form).

    Solves ``(X.T @ X + lambda_ * I) @ theta = X.T @ y`` where ``X`` has the
    columns ``x**0 .. x**degree``. The penalty is applied to every
    coefficient, including the constant term.

    Args:
        degree: Degree of the polynomial to fit.
        x: 1-D sequence of sample positions.
        y: 1-D sequence of target values, same length as ``x``.
        lambda_: Regularization strength added to the diagonal.

    Returns:
        Array ``theta`` of ``degree + 1`` coefficients in *increasing* power
        order (``theta[d]`` multiplies ``x ** d``); flip it before passing it
        to ``np.polyval``.
    """
    # Design matrix with columns x**0, x**1, ..., x**degree.
    X = np.vander(np.asarray(x, dtype=float), degree + 1, increasing=True)
    regularized_gram = X.T @ X + lambda_ * np.identity(degree+1)
    # Solving the linear system is cheaper and numerically more accurate than
    # forming the explicit inverse and multiplying by it.
    theta = np.linalg.solve(regularized_gram, X.T @ np.asarray(y, dtype=float))
    return theta

In [24]:
# Fit a degree-8 polynomial with the regularized normal equation for several
# lambda values; save one fit plot per lambda and compare train/test RMSE.
train_error = []
test_error = []
lambda_values = [0.01, 0.1, 0.3, 0.5, 0.7, 1]

for lambda_ in lambda_values:
    # The helper returns the lowest power first; np.polyval/np.poly1d expect
    # the highest power first, hence the flip.
    w = np.flip(calculate_w_based_on_normal_equation_with_lambda(8, train['x'], train['y'], lambda_=lambda_))

    train_error.append(rmse(train['y'], train['x'], w))
    test_error.append(rmse(test['y'], test['x'], w))

    # Evaluate the fitted polynomial across the training x-range.
    polynomial = np.poly1d(w)
    x_values = np.linspace(min(train['x']), max(train['x']))
    y_values = polynomial(x_values)

    fig = train.plot(x='x', y='y', kind='scatter')
    fig.add_trace(go.Scatter(x=x_values, y=y_values, mode="lines"))
    fig.update_layout(
        showlegend=False,
        title={'text': "degree=8, lambda={}".format(lambda_), 'x': 0.5},
        margin={'l': 0, 'r': 0, 'b': 0}
    )
    fig.write_image('images/implementation/q1/part_e/{}.png'.format(lambda_))

# Bar labels: '0.01', '0.1', '0.3', '0.5', '0.7', '1'.
lambda_labels = [str(value) for value in lambda_values]

train_error_fig = go.Figure(
    data=[
        go.Bar(name='Train', x=lambda_labels, y=train_error),
        go.Bar(name='Test', x=lambda_labels, y=test_error)
    ],
    layout={
        'title': {'text': 'Impact of lambda', 'x': 0.5},
        'barmode': 'group'
    }
)

train_error_fig.show()

# Question 2

## Part a

In [61]:
# Load the second dataset from its Excel workbook.
df2 = pd.read_excel(io="dataset2/CSM_dataset.xlsx")

Finding number of missing values in each column

In [62]:
# Count the missing entries of every column; the bare last line displays the result.
missing_value_per_column = df2.isna().sum()
missing_value_per_column

Movie                   0
Year                    0
Ratings                 0
Genre                   0
Gross                   0
Budget                  1
Screens                10
Sequel                  0
Sentiment               0
Views                   0
Likes                   0
Dislikes                0
Comments                0
Aggregate Followers    35
dtype: int64

Filling the missing values.

In [63]:
# Impute every column that has missing values with that column's mean.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0,
# and the column is assigned back explicitly because an in-place fillna on
# df2[col] may act on a temporary copy and leave df2 unchanged.
for col, number_of_missing_value in missing_value_per_column.items():
    if number_of_missing_value > 0:
        df2[col] = df2[col].fillna(df2[col].mean())

For preprocessing we do the following:

1. Remove the 'Movie' column.
2. Rescale the data to the [0, 1] range.
3. Remove highly correlated features (features with correlation > 0.7 will be removed).

In [64]:
# Remove 'Movie' column
df2 = df2.loc[:, df2.columns != 'Movie']

# Rescale data to [0, 1] range (min-max scaling), computed for all columns at
# once. This builds a new float frame instead of pushing values into a sliced
# frame through DataFrame.update, which depended on implicit int -> float
# upcasting and skipped NaN results silently.
column_min = df2.min()
column_range = df2.max() - column_min
# A constant column has zero range; divide by 1 instead so it maps to 0 and
# stays inside [0, 1] rather than becoming NaN.
column_range = column_range.where(column_range != 0, 1)
df2 = (df2 - column_min) / column_range


## Part b

In [72]:
# Pairwise correlation between the features (everything except the target
# 'Ratings'), displayed as a colour-graded table.
is_feature_column = df2.columns != 'Ratings'
corr = df2.loc[:, is_feature_column].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Year,Genre,Gross,Budget,Screens,Sequel,Sentiment,Views,Likes,Dislikes,Comments,Aggregate Followers
Year,1.0,-0.026277,0.122672,0.098427,0.248055,0.103996,0.229858,0.207959,0.084561,0.234496,0.043037,-0.033727
Genre,-0.026277,1.0,-0.197009,-0.264052,-0.143988,-0.224474,-0.010831,-0.032047,-0.043544,-0.03173,-0.099919,0.007913
Gross,0.122672,-0.197009,1.0,0.718992,0.579697,0.423711,-0.0171,0.176363,0.110432,0.161536,0.12596,0.290633
Budget,0.098427,-0.264052,0.718992,1.0,0.58561,0.464593,0.033604,0.114539,0.011696,0.096825,0.090508,0.162437
Screens,0.248055,-0.143988,0.579697,0.58561,1.0,0.266616,-0.016958,0.253188,0.158427,0.264582,0.191284,0.192167
Sequel,0.103996,-0.224474,0.423711,0.464593,0.266616,1.0,-0.106769,-0.042763,-0.036089,-0.059792,-0.069333,0.225717
Sentiment,0.229858,-0.010831,-0.0171,0.033604,-0.016958,-0.106769,1.0,0.064134,0.053883,0.040574,0.056372,-0.085929
Views,0.207959,-0.032047,0.176363,0.114539,0.253188,-0.042763,0.064134,1.0,0.677175,0.776105,0.710507,0.14862
Likes,0.084561,-0.043544,0.110432,0.011696,0.158427,-0.036089,0.053883,0.677175,1.0,0.470645,0.917492,0.077705
Dislikes,0.234496,-0.03173,0.161536,0.096825,0.264582,-0.059792,0.040574,0.776105,0.470645,1.0,0.579966,0.050994


## Part d

In [75]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and the error values reported below) reproducible after a kernel restart.
train2, test2 = train_test_split(df2, train_size=0.8, random_state=42)
# Reduced feature set (plus the 'Ratings' target): Gross, Views, Likes and
# Dislikes are left out because each takes part in a feature pair whose
# correlation exceeds 0.7 in the table above.
selected_features = ['Year', 'Genre', 'Budget', 'Screens', 'Sequel', 'Sentiment', 'Comments', 'Aggregate Followers', 'Ratings']

In [79]:
def multivariate_gradient_descent(
    learning_rate,
    iteration,
    train, 
    test
):
    """Fit a linear model (no intercept term) for 'Ratings' by batch gradient descent.

    Args:
        learning_rate: Base learning rate; it is decayed linearly towards
            zero over the run.
        iteration: Number of epochs (full-batch updates).
        train: DataFrame whose 'Ratings' column is the target and whose
            remaining columns are the features used for fitting.
        test: DataFrame with the same layout, used only for evaluation.

    Returns:
        Tuple ``(train_error, test_error, step_size)`` of arrays with one
        entry per epoch: the mean squared error on each set after the update,
        and the (signed) mean of the update applied to the weights.
    """
    Y_train = train['Ratings'].to_numpy()
    X_train = train.loc[:, train.columns != 'Ratings'].to_numpy()
    Y_test = test['Ratings'].to_numpy()
    X_test = test.loc[:, test.columns != 'Ratings'].to_numpy()

    m, n = X_train.shape
    w = np.random.rand(n)

    train_error = np.zeros(iteration)
    test_error = np.zeros(iteration)
    step_size = np.zeros(iteration)
    for epoch in range(iteration):
        # Predictions are taken once per epoch, so all weights are updated
        # simultaneously from the same residuals.
        residual = (X_train @ w) - Y_train

        # Vectorised form of sum_i residual[i] * X[i, j] / m for every j;
        # one matrix product replaces the n * m Python-level loop.
        decay = (iteration - epoch)/iteration
        descent = learning_rate * decay * (1/m) * (X_train.T @ residual)
        w -= descent

        train_error[epoch] = (1/len(Y_train)) * np.sum((Y_train - (X_train @ w))**2)
        test_error[epoch] = (1/len(Y_test)) * np.sum((Y_test - (X_test @ w))**2)
        step_size[epoch] = np.average(descent)

    return train_error, test_error, step_size

In [98]:
# Train on every feature column of the scaled dataset.
train_error, test_error, step_size = multivariate_gradient_descent(
    learning_rate=0.2,
    iteration=5000,
    train=train2,
    test=test2,
)

In [99]:
# Plot train and test error against the iteration number, then report the
# error reached after the final iteration.
error_figure_dataframe = pd.DataFrame(data={
    'iteration': list(range(len(train_error))),
    'Train Error': train_error,
    'Test Error': test_error
})
error_figure = px.line(
    error_figure_dataframe,
    x='iteration',
    y=['Train Error', 'Test Error'],
)

error_title = {
    'text': 'Change of Error in Iterations',
    'x': 0.5,
    'font': {'family': 'Nimbus Sans', 'size': 20},
}
error_figure.update_layout({"title": error_title})
error_figure.show()

print("Final Error in Train Dataset:", train_error[-1])
print("Final Error in Test Dataset:", test_error[-1])


Final Error in Train Dataset: 0.026573915791594304
Final Error in Test Dataset: 0.03238654569554243


In [100]:
# Plot the recorded step size against the iteration number.
step_size_figure_dataframe = pd.DataFrame(data={
    'iteration': list(range(len(train_error))),
    'Step Size': step_size,
})
step_size_figure = px.line(
    step_size_figure_dataframe,
    x='iteration',
    y='Step Size',
)

step_size_title = {
    'text': 'Change of Step Size in Iterations',
    'x': 0.5,
    'font': {'family': 'Nimbus Sans', 'size': 20},
}
step_size_figure.update_layout({"title": step_size_title})
step_size_figure.show()

In [101]:
# Train again, this time restricted to the columns listed in selected_features.
train_error, test_error, step_size = multivariate_gradient_descent(
    learning_rate=0.15,
    iteration=5000,
    train=train2[selected_features],
    test=test2[selected_features],
)

In [102]:
# Plot train and test error against the iteration number for the latest run,
# then report the error reached after the final iteration.
error_figure_dataframe = pd.DataFrame(data={
    'iteration': list(range(len(train_error))),
    'Train Error': train_error,
    'Test Error': test_error
})
error_figure = px.line(
    error_figure_dataframe,
    x='iteration',
    y=['Train Error', 'Test Error'],
)

error_title = {
    'text': 'Change of Error in Iterations',
    'x': 0.5,
    'font': {'family': 'Nimbus Sans', 'size': 20},
}
error_figure.update_layout({"title": error_title})
error_figure.show()

print("Final Error in Train Dataset:", train_error[-1])
print("Final Error in Test Dataset:", test_error[-1])

Final Error in Train Dataset: 0.029697076085470047
Final Error in Test Dataset: 0.042752557175889654


In [103]:
# Plot the recorded step size against the iteration number for the latest run.
step_size_figure_dataframe = pd.DataFrame(data={
    'iteration': list(range(len(train_error))),
    'Step Size': step_size,
})
step_size_figure = px.line(
    step_size_figure_dataframe,
    x='iteration',
    y='Step Size',
)

step_size_title = {
    'text': 'Change of Step Size in Iterations',
    'x': 0.5,
    'font': {'family': 'Nimbus Sans', 'size': 20},
}
step_size_figure.update_layout({"title": step_size_title})
step_size_figure.show()