In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from scipy import stats
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
# Load the raw LEGO data set; it is read as a comma-separated, Latin-1 encoded file.
df = pd.read_csv(
    "../lego.population.csv",
    sep=",",
    encoding="latin1",
)

## Rensking av datasett

In [None]:
# Keep only the columns that are used in the analysis.
df2 = df[['Set_Name', 'Theme', 'Pieces', 'Price', 'Pages', 'Minifigures', 'Unique_Pieces', 'Ages']]

# Drop observations that have a missing value in any of the kept columns.
df2 = df2.dropna()

# Cast Theme to string and strip every character that is not an ASCII letter,
# a digit, whitespace or '-'.
df2['Theme'] = df2['Theme'].astype(str).str.replace(r'[^a-zA-Z0-9\s-]', '', regex = True)

# Convert Price to float by removing the dollar sign. A literal (non-regex)
# replacement is used: writing '\$' in a normal string literal is an invalid
# escape sequence, which newer Python versions warn about.
df2['Price'] = df2['Price'].str.replace('$', '', regex = False).astype(float)

# Exclude every set whose name contains "Calendar". The explicit copy makes df2
# an independent frame, so later cells can add columns to it without triggering
# pandas' SettingWithCopyWarning.
df2 = df2[~df2['Set_Name'].str.contains("Calendar")].copy()

# Optional filters for removing all DUPLO sets (currently disabled):
#df2 = df2[df2['Theme'] != "DUPLO"]
#df2 = df2[~df2['Ages'].str.contains('Ages_2')]
#df2 = df2[~df2['Ages'].str.contains('Ages_1½')]

df2

## Deler inn i subsets med varemerke

In [None]:

# Themes grouped by trademark status. The names are compared against the
# cleaned values of df2['Theme'].
trademarked = {
    'Spider-Man', 'Powerpuff Girls', 'Minions', 'Stranger Things', 'Star Wars',
    'Marvel', 'Disney', 'Harry Potter', 'Minecraft', 'Jurassic World',
    'Batman', 'DC', 'Trolls World Tour', 'Overwatch', 'LEGO Frozen 2',
}

uncertain = {
    'Unikitty', 'Minifigures', 'THE LEGO MOVIE 2',
    'Speed Champions', 'Juniors', 'Creator Expert',
}

notrademark = {
    'Monkie Kid', 'Friends', 'City', 'NINJAGO', 'DUPLO',
    'Creator 3-in-1', 'Hidden Side', 'Ideas', 'Classic', 'Powered UP',
}

# Optional sanity check (disabled): the three groups together should equal the
# set of themes present in the data.
#assert (trademarked | uncertain | notrademark) == set(df2['Theme'])

# One sub-frame per group, selected by theme membership.
df_trademark, df_uncertain, df_notrademark = (
    df2[df2['Theme'].isin(theme_group)]
    for theme_group in (trademarked, uncertain, notrademark)
)

# Report the size of each group.
print(f"Length trademark:    {len(df_trademark)}")
print(f"Length uncertain:    {len(df_uncertain)}")
print(f"Length no trademark: {len(df_notrademark)}")




## 4.A) Pris beskrevet av antall brikker

In [None]:
# Simple linear regression: Price explained by Pieces alone.
formel = 'Price ~ Pieces'
model4a = smf.ols(formula=formel, data=df2)
resultat = model4a.fit()

# The regression table is the cell output.
resultat.summary()


In [None]:
# Diagnostics for the simple regression `resultat`:
# left panel = residuals against fitted values, right panel = normal Q-Q plot.
figure, axis = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

sns.scatterplot(x=resultat.fittedvalues, y=resultat.resid, ax=axis[0])
axis[0].set(xlabel="Predikert verdi", ylabel="Residual")

sm.qqplot(resultat.resid, line='45', fit=True, ax=axis[1])
axis[1].set(xlabel="Kvantiler i normalfordelingen", ylabel="Kvantiler i residualene")

plt.show()

## 4.B) Pris beskrevet i antall brikker og forklaringsvariabel

### 4.B.1) Pris beskrevet i antall brikker og antall sider

In [None]:

# Multiple linear regression: Price explained by Pieces and Pages.
# TODO: the sample data, rather than df2, may be what should be passed as `data`.
model4b1 = smf.ols(
    'Price ~ Pieces + Pages',
    data=df2,
)
result = model4b1.fit()

# The regression table is the cell output.
result.summary()


### 4.B.2) Pris beskrevet i antall brikker og minifigures 

In [None]:

# Multiple linear regression: Price explained by the number of pieces and the
# number of minifigures, matching this section's heading.
# Columns available in df2:
# 'Set_Name', 'Theme', 'Pieces', 'Price', 'Pages', 'Minifigures', 'Unique_Pieces'
model4b2 = smf.ols('Price ~ Pieces + Minifigures' , data = df2)

# NOTE: `result` is overwritten here; any later cell that reads `result`
# sees this model, not the 4.B.1 model.
result = model4b2.fit()
result.summary()


### 4.B.3) Jumbomodell

In [None]:
from IPython.display import display, Markdown

# One simple linear regression of Price per explanatory variable.
# For each field the regression table is displayed, and the data together with
# the fitted line is drawn into its own panel of a single shared figure.
fields = ['Pieces', 'Pages', 'Minifigures', 'Unique_Pieces']

# Smallest square grid that has room for one panel per field.
subplot_cols = subplot_rows = math.ceil(math.sqrt(len(fields)))

# Create the figure once; squeeze=False keeps `jumbo_axes` two-dimensional
# even when the grid is 1x1.
jumbo_fig, jumbo_axes = plt.subplots(
    subplot_rows, subplot_cols,
    figsize=(6 * subplot_cols, 4.5 * subplot_rows),
    squeeze=False,
)
panels = jumbo_axes.ravel()

for i, field in enumerate(fields):
    model4b3 = smf.ols('Price ~ ' + field, data = df2)
    fit      = model4b3.fit()
    display(Markdown(f"## {field}"))
    display(fit.summary())

    # The fitted line needs both coefficients; without the intercept the line
    # would be forced through the origin and would not match the model.
    intercept = fit.params['Intercept']
    slope     = fit.params[field]
    regression_x = np.array(df2[field])
    regression_y = intercept + slope * regression_x

    ax = panels[i]
    ax.scatter(df2[field], df2['Price'], color=plt.cm.tab10(i), alpha=0.5)
    ax.plot(regression_x, regression_y, color=plt.cm.tab10(i), label=field)

    ax.set_xlabel(field)
    ax.set_ylabel('Pris [$]')
    ax.legend()
    ax.grid()

# Hide grid cells that have no field assigned to them.
for unused_ax in panels[len(fields):]:
    unused_ax.set_visible(False)

jumbo_fig.tight_layout()
plt.show()
    

In [None]:
# Diagnostics for the model currently stored in `result`, i.e. the most recent
# assignment to `result` above (the 4.B.2 cell), not the per-field fits of the
# previous cell, which are stored in `fit`.
# Left panel = residuals against fitted values, right panel = normal Q-Q plot.
figure, axis = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

sns.scatterplot(x=result.fittedvalues, y=result.resid, ax=axis[0])
axis[0].set(xlabel="Predikert verdi", ylabel="Residual")

sm.qqplot(result.resid, line='45', fit=True, ax=axis[1])
axis[1].set(xlabel="Kvantiler i normalfordelingen", ylabel="Kvantiler i residualene")

plt.show()

## 4.C.A) Pris beskrevet av antall brikker, per gruppe (3x LR)

In [None]:

# One entry per theme group: the data subset, a display name and a plot colour.
subsets = [
    {
        "data": df_trademark,
        "name": "trademarked",
        "color" : "red"
    },
    {
        "data": df_uncertain,
        "name": "uncertain",
        "color": "blue"
    },
    {
        "data": df_notrademark,
        "name": "no trademark",
        "color": "green"
    }
]

# Never filled in this cell; kept so the name still exists for any later cell.
plots=[]

# Create the row of panels once with an explicit size. Sizing the figure here,
# instead of rescaling it inside the loop, keeps the width from compounding
# with the number of subsets.
fig, group_axes = plt.subplots(
    1, len(subsets),
    figsize=(7.2 * len(subsets), 4.8),
    squeeze=False,
)

for i,subset in enumerate(subsets):
    data = subset['data']

    # Simple linear regression of Price on Pieces within this group only.
    model = smf.ols('Price ~ Pieces' , data = data)
    fit = model.fit()
    display(Markdown(f'### {subset["name"]}'))
    display(fit.summary())

    slope = fit.params['Pieces']
    intercept = fit.params['Intercept']

    # Fitted line evaluated at the observed piece counts.
    reg_x = np.array(data['Pieces'])
    reg_y = slope * reg_x + intercept

    # Scatter plot and regression line; identical axis limits in every panel
    # make the groups directly comparable.
    ax = group_axes[0, i]
    ax.set_ylim([0, 800])
    ax.set_xlim([0, 7000])
    ax.grid()
    ax.set_title(subset['name'])
    ax.set_xlabel('Antall brikker')
    ax.set_ylabel('Pris')
    ax.scatter(data['Pieces'], data['Price'], color=subset['color'], alpha=0.1)
    ax.plot(reg_x, reg_y, color=subset['color'], label=subset['name'])


plt.show()

## MLR

In [None]:
# Label every set with its theme group. np.select uses the first condition
# that matches, so the priority is trademark, then no trademark, then uncertain;
# themes in none of the three sets are labelled 'unaccounted for'.
df2['Category'] = np.select(
    [
        df2['Theme'].isin(trademarked),
        df2['Theme'].isin(notrademark),
        df2['Theme'].isin(uncertain),
    ],
    ['trademark', 'no trademark', 'uncertain'],
    default = 'unaccounted for',
)

# Show the number of sets per category. An explicit display() is needed because
# only the last expression of a cell is rendered automatically.
display(df2.groupby(['Category']).size().reset_index(name = 'Count'))

# Multiple linear regression: Price explained by Pieces plus a categorical
# term for the theme group.
modell3_mlr = smf.ols('Price ~ Pieces + Category' , data = df2)
modell3_mlr.fit().summary()

## MLR med eget stigningstall for hver kategori

In [None]:
# (Re)build the category label so this cell does not depend on the previous
# cell having been run. np.select uses the first matching condition; themes in
# none of the three sets are labelled 'unaccounted for'.
df2['Category'] = np.select(
    [
        df2['Theme'].isin(trademarked),
        df2['Theme'].isin(notrademark),
        df2['Theme'].isin(uncertain),
    ],
    ['trademark', 'no trademark', 'uncertain'],
    default = 'unaccounted for',
)

# Show the number of sets per category (explicit display(), since this is not
# the last expression of the cell).
display(df2.groupby(['Category']).size().reset_index(name = 'Count'))

# Disabled exploratory code: optional price cap and a box plot per category.
#df2 = df2[df2['Price'] <= 400]

#plt.figure(figsize=(12, 8))
#sns.boxplot(x='Category', y='Price', data=df2)
#plt.title('Box Plot of Prices by Theme Category')
#plt.xlabel('Theme Category')
#plt.ylabel('Price')
#display(plt.show())


def category_line(params, field, category):
    """Return (intercept, slope) of the fitted line for one category.

    params   -- fitted coefficients of a 'Price ~ <field> * Category' model
                (a pandas Series indexed by coefficient name).
    field    -- name of the numeric explanatory variable.
    category -- category level whose line is wanted.

    The reference level of Category has no coefficients of its own, so its
    offsets are looked up with a default of 0 and its line is simply the
    model's 'Intercept' and `field` coefficients.
    """
    intercept = params['Intercept'] + params.get(f'Category[T.{category}]', 0.0)
    slope     = params[field] + params.get(f'{field}:Category[T.{category}]', 0.0)
    return intercept, slope


# Plot style per category, in the same order as the category list used below.
colors = [
    {'c': 'red',    'a': 1},
    {'c': 'orange', 'a': 1},
    {'c': 'blue',   'a': 1}
]

for field in ["Pages", "Pieces", "Unique_Pieces", "Minifigures"]:
    display(Markdown(f'# {field}'))

    # Interaction model: every category gets its own intercept and its own
    # slope in `field`. The formula is built from `field`, so the coefficient
    # names looked up in category_line exist for every field in the loop.
    model = smf.ols(f'Price ~ {field} * Category' , data = df2)
    fit = model.fit()

    display(Markdown('### MLR Summary'))
    display(fit.summary())

    # Regression lines: coefficients are resolved by category name, so each
    # line is always drawn with the coefficients of its own category.
    for style, theme in zip(colors, ['trademark', 'no trademark', 'uncertain']):
        subset = df2[df2['Category'].isin([theme])]
        intercept, slope = category_line(fit.params, field, theme)

        regression_x = np.array(subset[field])
        regression_y = slope * regression_x + intercept

        # Scatter plot and regression line for this category
        plt.scatter(subset[field], subset['Price'], color=style['c'], alpha=style['a'])
        plt.plot(regression_x, regression_y, color=style['c'], alpha=style['a'], label=theme)

    plt.xlabel(field)
    plt.ylabel('Price [$]')
    plt.title('Kryssplott med regresjonslinjer')
    plt.legend()
    plt.grid()

    # Display
    display(Markdown('### Kryssplot'))
    plt.show()

    # Q-Q plot and residual plot for the model of this field
    figure, axis = plt.subplots(1, 2, figsize = (15, 5))
    sns.scatterplot(x = fit.fittedvalues, y = fit.resid, ax = axis[0])
    axis[0].set_ylabel("Residual")
    axis[0].set_xlabel("Predikert verdi")

    sm.qqplot(fit.resid, line = '45', fit = True, ax = axis[1])
    axis[1].set_ylabel("Kvantiler i residualene")
    axis[1].set_xlabel("Kvantiler i normalfordelingen")

    display(Markdown('### QQ og residualplot'))
    plt.show()

    # Price per unit of the field; used for the sorted tables below.
    df2[f'price_per_{field}'] = df2['Price'] / df2[field]


# One table per field, sorted ascending by price per unit of that field.
for field in ["Pages", "Pieces", "Unique_Pieces", "Minifigures"]:
    display(Markdown(f'### Sorted by price_per_{field}'))
    display(df2.sort_values(by=[f'price_per_{field}']))

