### Checking for multicollinearity to see if OLS is a good model

To check for multicollinearity, you can use the Variance Inflation Factor (VIF), which measures how much the variance of an estimated regression coefficient is inflated by collinearity among the predictors. A VIF of 1 means that a predictor variable is uncorrelated with the other predictor variables; a VIF greater than 1 means that there is some correlation, and the greater the VIF, the stronger the correlation. VIF can be calculated with the `statsmodels` library in Python.

In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Load the cleaned dataset from its Excel workbook into a DataFrame.
DATASET_PATH = '../used_data/cleaned-dataset.xlsx'
df = pd.read_excel(DATASET_PATH)

In [None]:
# Build a table with one row per column of `df`: the column's variance
# inflation factor next to the column's name.
# NOTE(review): every column of `df` is passed in as-is and no intercept
# column is added -- confirm this is the intended design matrix for the VIF.
design_matrix = df.values
column_positions = range(df.shape[1])
vif = pd.DataFrame(
    {
        "VIF Factor": [
            variance_inflation_factor(design_matrix, position)
            for position in column_positions
        ],
    }
)
vif["features"] = df.columns

### Other methods to check for multicollinearity

In [None]:
# Keep only the rows whose 'Has Industry' value equals 1.
df_industry = df.loc[df['Has Industry'].eq(1)]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Predictor columns (x) and the target column (y) used for the regression.
feature_columns = [
    'Average Payout',
    'Hall of Famers',
    'Number People',
    'Maximum Reword',
    'annocument_count',
    'Reward Range Average',
    'Validation Within Hours',
    'P1 Average',
    'P2 Average',
    'P3 Average',
    'P4 Average',
    'Is Private',
    'Is Safe Harbor_Partial safe harbor',
    'Is Safe Harbor_Safe harbor',
    'Is Safe Harbor_Not Safe Harbor',
]
target_column = 'Vulnearbilities Rewarded'
x = df_industry[feature_columns]
y = df_industry[target_column]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Create the linear regression model and fit it on the training split
# (fit() returns the fitted estimator itself).
reg = LinearRegression().fit(x_train, y_train)

# Show the fitted coefficient of every predictor, then the intercept.
print(reg.coef_)
print(reg.intercept_)

# Rank the predictors by absolute coefficient size. argsort is ascending, so
# the last seven positions are the seven largest magnitudes.
# NOTE(review): the model is fitted on the columns as given (no scaling step
# in this cell), so the ranking depends on each column's units -- confirm
# that this is intended.
coef = reg.coef_
coef_abs = np.absolute(coef)
top_n = coef_abs.argsort()[-7:]

# Names of the predictors at those seven positions.
top_n_features = x.columns[top_n]

print(top_n_features)

In [None]:
# Metrics used to score the predictions below.
from sklearn.metrics import mean_squared_error, r2_score

# Predict the target for the held-out test split with the fitted model.
y_pred = reg.predict(x_test)

# Compare the predictions with the true test values and report both scores.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("Mean squared error:", test_mse)
print("R-squared:", test_r2)

In [None]:
import statsmodels.api as sm
import numpy as np

# Predictor columns (x) and the target column (y); the model is re-fitted
# here so the checks that follow do not rely on an earlier cell's fit.
feature_columns = [
    'Average Payout',
    'Hall of Famers',
    'Number People',
    'Maximum Reword',
    'annocument_count',
    'Reward Range Average',
    'Validation Within Hours',
    'P1 Average',
    'P2 Average',
    'P3 Average',
    'P4 Average',
    'Is Private',
    'Is Safe Harbor_Partial safe harbor',
    'Is Safe Harbor_Safe harbor',
    'Is Safe Harbor_Not Safe Harbor',
]
target_column = 'Vulnearbilities Rewarded'
x = df_industry[feature_columns]
y = df_industry[target_column]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Create the linear regression model and fit it on the training split
# (fit() returns the fitted estimator itself).
reg = LinearRegression().fit(x_train, y_train)

# Linearity check: draw one scatter plot per predictor column against the
# target, using the training split, and show each plot separately.
for feature_name, feature_values in x_train.items():
    plt.scatter(feature_values, y_train)
    plt.xlabel(feature_name)
    plt.ylabel('y')
    plt.show()

# Normality of errors: Q-Q plot of the test-set residuals (observed value
# minus the model's prediction) against normal quantiles.
residuals = y_test - reg.predict(x_test)
# line='s' adds a reference line scaled by the sample's standard deviation
# with its mean added, so departures from normality can be judged against it.
sm.qqplot(residuals, line='s')
# Render the Q-Q figure now; otherwise it stays the current figure and any
# following pyplot call would draw on top of it.
plt.show()

# Homoscedasticity: plot the test-set residuals against the predicted values.
# A roughly constant vertical spread around zero supports the assumption.
# The plot does not depend on any single predictor, so it is drawn once
# rather than once per feature column.
predicted = reg.predict(x_test)
# Start a fresh figure so the points are not drawn onto a previous plot.
plt.figure()
plt.scatter(predicted, residuals)
# Zero line as a visual reference for the residuals.
plt.axhline(0, color='black', linewidth=1)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.show()
