<a href="https://colab.research.google.com/github/nethajisubash/AI-ML-Projects/blob/main/Week9-Jan22-LinAlg2/PreClass_Linear_Algebra_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Simple Linear Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Mount Google Drive at /content/drive/ (Colab-only; requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Path of the Boston housing CSV that the pd.read_csv cells read from.
# NOTE(review): absolute, user-specific Drive path — adjust before running in another account/environment.
boston_data_path = "/content/drive/MyDrive/AIConsulting/IK_Arthan_Aujan_Innosential/INTERVIEW KICKSTART/Final Notebooks-make changes here/datasets/Boston.csv"

In [None]:
# Read the dataset from boston_data_path into a DataFrame and preview its first rows
df = pd.read_csv(boston_data_path)
df.head()

In [None]:
# checking the shape of the dataset: (number of rows, number of columns)
df.shape

In [None]:
# Target is the MEDV column (a Series); the single predictor RM is selected with
# double brackets so X stays a one-column DataFrame.
Y = df['MEDV']
X = df[['RM']]

In [None]:
# visualizing the data: scatter of the predictor RM (x-axis) against the target MEDV (y-axis)
plt.scatter(X, Y)
plt.xlabel('RM')
plt.ylabel('MEDV')
plt.grid(True)
plt.show()

In [None]:
# splitting training and testing data: 75% of the rows go to training, the rest to testing;
# random_state is fixed so the split is the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.75 , random_state = 0)

In [None]:
# applying Linear Regression: fit the model on the training split only
model = LinearRegression()
model.fit(X_train, Y_train)

In [None]:
# printing the coefficients of the fitted line
# ypred = a + b * x   (a = model.intercept_, b = model.coef_)
print('a = ', model.intercept_)
print('b = ', model.coef_)

In [None]:
# Making predictions on the test set and displaying the shape of the prediction array
Y_pred = model.predict(X_test)
Y_pred.shape

In [None]:
# visualizing the predictions: actual vs predicted MEDV for the first 75 test samples
n_shown = min(75, Y_test.shape[0])  # never ask for more points than the test set holds
sample_index = range(n_shown)
plt.plot(sample_index, Y_test[:n_shown], color = "blue", linewidth = 1, linestyle = "-", label = "Actual")
plt.plot(sample_index, Y_pred[:n_shown], color = "red",  linewidth = 1, linestyle = "-.", label = "Predicted")
plt.title('Actual value vs Predicted value')
plt.xlabel('Index')
# The plotted quantity is the target column MEDV (the original label said 'Sales')
plt.ylabel('MEDV')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# visualizing the error terms: residual (actual - predicted MEDV) for every test sample
residuals = Y_test - Y_pred
plt.plot(range(len(residuals)), residuals, color = "red", linewidth = 1, linestyle = "-")
plt.title('Errors')
plt.xlabel('Index')
# Residuals are in the units of MEDV (the original label said 'Sales')
plt.ylabel('Residual (MEDV)')
plt.grid(True)
plt.show()

In [None]:
# mean squared error computation between the test targets and the predictions
mse = mean_squared_error(Y_test, Y_pred)
print(mse)

In [None]:
# r2 value computation (coefficient of determination) on the test split
r2 = r2_score(Y_test, Y_pred)
print(r2)

In [None]:
# Test observations (blue points) together with the model's predictions for the same X values (red line)
plt.scatter(X_test, Y_test, c = 'blue')
plt.plot(X_test, Y_pred, c = 'red')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid()
plt.show()

# Multiple Linear Regression using SK-LEARN

In [None]:
import numpy as np
import pandas as pd
import sklearn.datasets as datasets
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Re-read the dataset from boston_data_path (replaces any earlier df) and preview its first rows
df = pd.read_csv(boston_data_path)
df.head()

In [None]:
# List the column names available in the dataset
df.columns

In [None]:
# Target is MEDV; predictors are the twelve columns listed explicitly below
Y = df['MEDV']
X = df[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']]

In [None]:
# Display the DataFrame (pandas truncates the rendering of long frames)
df

In [None]:
# visualizing the data: annotated heatmap of the pairwise correlations between all columns of df
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(),annot=True,cmap="YlGnBu")

In [None]:
# splitting variables and generating training and testing data (33% held out for testing, fixed seed).
# Note the lower-case y_train / y_test names used in this section.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 101)

In [None]:
# Linear Regression Model: fit on the multi-feature training split (rebinds `model`)
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Making Predictions on the held-out test features.
# (A single predict call is enough; the result is kept in `pred` for the evaluation cell.)
pred = model.predict(X_test)

In [None]:
# Evaluating Model's Performance on the test split.
# The MSE is computed once and reused for the RMSE (its square root).
mlr_mse = mean_squared_error(y_test, pred)
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('Mean Squared Error:', mlr_mse)
print('Root Mean Squared Error:', np.sqrt(mlr_mse))

# Multiple Linear Regression using STATSMODELS

In [None]:

# Import libraries for data manipulation
import pandas as pd
import numpy as np

# Import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import ProbPlot

# Import libraries for building linear regression model using statsmodel
from statsmodels.formula.api import ols
import statsmodels.api as sm

# Importing Linear Regression from sklearn
from sklearn.linear_model import LinearRegression

# Import library for preparing data
from sklearn.model_selection import train_test_split

# Import library for data preprocessing
from sklearn.preprocessing import MinMaxScaler

import warnings
# Silence every warning for the remainder of the session.
# NOTE(review): this also hides deprecation and convergence warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Re-read the dataset from boston_data_path (replaces any earlier df) and preview its first rows
df = pd.read_csv(boston_data_path)
df.head()

**Observations:**
* The price of the house indicated by the variable MEDV is the target variable and the rest are the independent variables based on which we will predict house price.


In [None]:

# Get information about the dataset using the info() method

df.info()

**Observations:**
* There are a total of 506 non-null observations in each of the columns. This indicates that there are no missing values in the data.

* Every column in this dataset is numeric in nature.

## **Exploratory Data Analysis**

#### **Let's now check the summary statistics of this dataset**

In [None]:
# Summary statistics, transposed so that each row of the output is one column of df
df.describe().transpose()

### **Let's check the correlation using the heatmap**

In [None]:
# Correlation heatmap of all columns, annotated to two decimals, using a diverging colour map
plt.figure(figsize = (12, 8))
cmap = sns.diverging_palette(230, 20, as_cmap = True)
sns.heatmap(df.corr(), annot = True, fmt = '.2f', cmap = cmap )
plt.show()


## **Model Building - Approach**

1. Data preparation
2. Partition the data into train and test set
3. Build model on the train data
4. Cross-validating the model
5. Test the data on test set




### **Split the dataset**
Let's split the data into the dependent and independent variables and further split it into train and test set in a ratio of 70:30 for train and test set.

# Separate the dependent and independent variables

In [None]:
# Target is MEDV; every other column is a predictor.
# The column to drop is given as a list (not a set literal), matching the other cells of this notebook.
Y = df['MEDV']
X = df.drop(columns = ['MEDV'])

# Add the intercept term: sm.OLS does not add a constant column on its own
X = sm.add_constant(X)

# Splitting the data in 70:30 ratio of train to test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30 , random_state = 1)





### **Creating linear regression model using statsmodels OLS**

In [None]:
# Create the model: ordinary least squares of y_train on X_train (X_train already includes the constant column)
model1 = sm.OLS(y_train, X_train).fit()

# Get the model summary (rendered as the cell output)
model1.summary()



- We can see that the `R-squared` for the model is `0.71`.
- Not all the variables are statistically significant to predict the outcome variable. To check which are statistically significant or have predictive power to predict the target variable, we need to check the `p-value` against all the independent variables.

**Interpreting the Regression Results:**

1. **Adj. R-squared** (adjusted R-squared): It reflects the fit of the model, penalized for the number of predictors used.
    - R-squared values range from 0 to 1, where a higher value generally indicates a better fit, assuming certain conditions are met.
    - In our case, the value for Adj. R-squared is **0.697**

2. **coeff**: It represents the change in the output Y due to a change of one unit in the variable (everything else held constant).
3. **std err**: It reflects the level of accuracy of the coefficients.
    - The lower it is, the more accurate the coefficients are.
4. **P >|t|**: It is p-value.
   
   * Pr(>|t|): For each independent feature there is a null hypothesis and an  alternate hypothesis

    Ho: Independent feature is not significant
   
    Ha: Independent feature is significant
    
   * A p-value of less than 0.05 is considered to be statistically significant.

   
5. **Confidence Interval**: It represents the range in which our coefficients are likely to fall (with a likelihood of 95%).



* The **R-squared (≈ 0.71) and Adjusted R-squared (≈ 0.70) of the model are close to each other**. This indicates a reasonably good model: it explains roughly 70% of the variance in the house prices on the training data.


In [None]:


#### **Checking the performance of the model on the train and test data set**

# RMSE
def rmse(predictions, targets):
    """Return the root mean squared error between `predictions` and `targets`.

    Both arguments must support element-wise subtraction and expose a
    ``.mean()`` method after squaring (e.g. NumPy arrays or pandas Series).
    """
    residuals = targets - predictions
    squared_errors = np.square(residuals)
    return np.sqrt(squared_errors.mean())


# MAPE
def mape(predictions, targets):
    """Return the mean absolute percentage error, in percent.

    Computed as ``mean(|(targets - predictions) / targets|) * 100``. The
    absolute value is taken of the whole ratio, so negative targets contribute
    a positive percentage error instead of cancelling positive ones out.

    Note: a target equal to zero leads to a division by zero (inf/nan result).
    """
    return np.mean(np.abs((targets - predictions) / targets)) * 100


# MAE
def mae(predictions, targets):
    """Return the mean absolute error between `predictions` and `targets`."""
    absolute_errors = np.absolute(targets - predictions)
    return np.mean(absolute_errors)




In [None]:
# Model Performance on test and train data
def model_pref(olsmodel, x_train, x_test, y_train, y_test):
    """Print a table of RMSE, MAE and MAPE for the train and test splits.

    `olsmodel` must expose ``predict``; it is called on `x_train` first and then
    on `x_test`. Each metric compares the predictions with the matching observed
    values (`y_train` / `y_test`). The table is printed and nothing is returned.
    """
    # In-sample predictions first, then predictions on the test data
    train_predictions = olsmodel.predict(x_train)
    test_predictions = olsmodel.predict(x_test)

    splits = [
        ("Train", train_predictions, y_train),
        ("Test", test_predictions, y_test),
    ]
    metrics = {"RMSE": rmse, "MAE": mae, "MAPE": mape}

    # One row per split, one column per metric (column order follows `metrics`)
    report = {"Data": [split_name for split_name, _, _ in splits]}
    for metric_name, metric_fn in metrics.items():
        report[metric_name] = [
            metric_fn(predicted, observed) for _, predicted, observed in splits
        ]

    print(pd.DataFrame(report))


# Checking model performance of the statsmodels OLS fit (model1) on its train and test splits
model_pref(model1, X_train, X_test, y_train, y_test)

## Forward Feature Selection using SequentialFeatureSelector

**Why should we do feature selection?**

- Reduces dimensionality
- Discards deceptive features (Deceptive features appear to aid learning on the training set, but impair generalization)
- Speeds training/testing


**How does forward feature selection work?**

* It starts with an empty model and adds variables one by one.
* In each forward step, you add the one variable that gives the highest improvement to your model.



In [None]:
# Re-read the dataset from boston_data_path (replaces any earlier df) and preview its first rows
df = pd.read_csv(boston_data_path)
df.head()

In [None]:
# Predictors: every column except MEDV. Target: MEDV kept as a one-column DataFrame (double brackets).
X = df.drop(columns=['MEDV'])
Y = df[['MEDV']]

# 75% of the rows for training, the rest for testing; fixed seed for a repeatable split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.75 , random_state = 0)

In [None]:
# Install mlxtend, which is imported in the following cells
!pip install mlxtend

In [None]:
import joblib
import sys
# Register joblib in sys.modules under the legacy name `sklearn.externals.joblib` before importing mlxtend.
# NOTE(review): presumably a compatibility shim for mlxtend releases that still import that path — confirm it is still needed.
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


reg = LinearRegression()

# Build step forward feature selection
sfs = SFS(
    reg,
    k_features=X_train.shape[1],  # k_features denotes the number of features to select (here: all columns of X_train)
    forward=True,  # forward selection
    floating=False,  # no floating variant
    scoring="r2",
    verbose=2,
    cv=5,
)

# Perform SFS (forward=True, floating=False)
sfs = sfs.fit(X_train, Y_train)

In [None]:
# to plot the performance with addition of each feature
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# kind="std_err" is the plot variant requested from mlxtend (see the title below)
fig1 = plot_sfs(sfs.get_metric_dict(), kind="std_err")
plt.title("Sequential Forward Selection (w. StdErr)")
plt.xticks(rotation=90)
plt.show()

- We can see that performance increases till the 5th feature and then slowly becomes constant, and then drops after the 10th feature is added.
- The decision to choose the *k_features* now depends on the cross-validated $R^2$ (the selector above uses `scoring="r2"` with `cv=5`) vs the complexity of the model.
    - With 5 features, we are getting an $R^2$ of 0.7223757171836604.
    - With 9 features, we are getting an $R^2$ of 0.7315777117242518.
    - With 10 features, we are getting an $R^2$ of 0.7307239154712025.
- The increase in $R^2$ beyond 5 features is not very significant, as we are getting nearly the same values with a less complex model.
- So we'll use 5 features only to build our model, but you can experiment by taking a different number.
- Number of features chosen will also depend on the business context and use case of the model.

### Introduction to PCA
PCA stands for Principal Component Analysis, and it is a widely used technique in data analysis and machine learning. At its core, PCA is a way to reduce the complexity of high-dimensional data by identifying the most important patterns and trends in the data.

Imagine you have a dataset with many variables, such as age, height, weight, income, and education level, and you want to understand how these variables are related to each other. PCA can help you by finding the underlying structure of the data and identifying the key factors that explain most of the variation in the data.

To do this, PCA uses linear algebra to transform the data into a new coordinate system that captures the most important information in the data. The new coordinate system is called the principal components, and each principal component is a linear combination of the original variables.

By examining the principal components, you can identify the most important patterns in the data and understand how different variables contribute to these patterns. You can also use the principal components to visualize the data in a lower-dimensional space, which can help you identify clusters or groups of similar data points.

Overall, PCA is a powerful tool for exploring and analyzing complex datasets, and it can be applied to a wide range of fields, including biology, economics, psychology, and computer science.

### PCA Theory
PCA is based on the concept of linear algebra, specifically the eigenvalue decomposition of a covariance matrix. In simple terms, the covariance matrix is a measure of the linear relationship between pairs of variables. Note that PCA is applied on centered data.

Let X be an n x p matrix representing the centered data, where n is the number of observations and p is the number of variables. The covariance matrix of X is given by:

C = (1/n) * X^T * X

where ^T denotes the transpose of a matrix. The covariance matrix is a symmetric positive semi-definite matrix, which means that it has p real eigenvalues and p orthogonal eigenvectors.

The eigendecomposition of C is given by:

C = V * Lambda * V^T

where V is a p x p matrix whose columns are the eigenvectors of C, and Lambda is a diagonal matrix whose entries are the corresponding eigenvalues.

The eigenvectors in V are sorted in descending order according to their corresponding eigenvalues in Lambda. The first principal component is the linear combination of the variables that corresponds to the eigenvector with the largest eigenvalue. The second principal component is the linear combination that corresponds to the eigenvector with the second largest eigenvalue, and so on.

To compute the principal components of the data, we multiply the centered data matrix X by the matrix of eigenvectors V:

Y = X * V

where Y is the matrix of principal components. Each column of Y represents a principal component, and each row represents an observation.

The proportion of variance explained by each principal component is given by its corresponding eigenvalue divided by the sum of all eigenvalues:

prop_i = lambda_i / (sum(lambda))

where prop_i is the proportion of variance explained by the i-th principal component.

PCA can be used for data compression by selecting the top k principal components that explain the most variance in the data. The compressed data can be reconstructed by multiplying the matrix of selected principal components by the transpose of the matrix of eigenvectors:

X_hat = Y_k * V^T_k

where Y_k is the matrix of the k selected principal components, and V_k is the matrix of the corresponding k eigenvectors. X_hat is the reconstructed data, which should be close to the original data X.

### Data Preparation for PCA
Suppose we have a dataset with n observations and p variables. Before applying PCA, we need to perform the following data preparation steps:

1. Standardization: PCA is sensitive to the scale of the variables, so we need to standardize the data to have zero mean and unit variance. This can be done by subtracting the mean of each variable and dividing by its standard deviation:
X_standardized = (X - mean(X)) / std(X) (where X is the original data matrix, mean(X) is the mean vector of X, and std(X) is the standard deviation vector of X)

2. Missing value imputation: If the dataset has missing values, we need to impute them before applying PCA. There are different imputation methods that can be used, such as mean imputation, regression imputation, or multiple imputation.

3. Outlier detection: Outliers can affect the results of PCA, so it is important to detect and handle them before applying PCA. One way to detect outliers is by computing the Mahalanobis distance of each observation from the mean of the data. Observations with a large Mahalanobis distance are considered outliers.

4. Variable selection: If the dataset has a large number of variables, it may be necessary to perform variable selection before applying PCA. This can be done using various methods, such as correlation analysis, mutual information, or feature importance scores.

After performing these data preparation steps, we can apply PCA to the standardized data X_standardized to obtain the principal components and perform exploratory data analysis.

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Re-read the dataset from boston_data_path (replaces any earlier df) and preview its first rows
df = pd.read_csv(boston_data_path)
df.head()

In [None]:
# Predictors: every column except MEDV. Target: MEDV as a one-column DataFrame.
X = df.drop(columns=['MEDV'])
y = df[['MEDV']]

# 75/25 train/test split with a fixed seed.
# NOTE(review): the scaling and PCA cells that follow operate on the full X, not on X_train — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size = 0.75 , random_state = 0)

In [None]:
# Standardize every predictor column, then wrap the resulting array
# back into a DataFrame that carries the original column names.
scaler = StandardScaler()
Xst = scaler.fit_transform(X)
xDf = pd.DataFrame(data = Xst, columns = X.columns)
xDf.head()

### Implementation of PCA

In [None]:
# n_components=None: no explicit limit on the number of components.
# xDf_PCA holds the return value of pca.fit(...); the later cells read explained_variance_ratio_ from it.
pca = PCA(n_components = None)
xDf_PCA = pca.fit(xDf)

In [None]:
# Scree-style scatter: explained variance ratio of each principal component,
# with components numbered from 1 on the x-axis.
pc_variance_ratios = xDf_PCA.explained_variance_ratio_
pc_numbers = list(range(1, len(pc_variance_ratios) + 1))

plt.scatter(
    x = pc_numbers,
    y = pc_variance_ratios,
    s = 200,
    alpha = 0.75,
    c = 'red',
    edgecolor = 'm',
)
plt.grid(True)
plt.title("Explained variance ratio of the fitted principal component vector")
plt.xlabel("Principal components")
plt.xticks(pc_numbers)
plt.yticks()
plt.ylabel("Variance ratio")
plt.show()

In [None]:
# Keep the per-component explained variance ratios under a shorter name
explained_variance_ratio=xDf_PCA.explained_variance_ratio_

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# xDf_PCA.explained_variance_ratio_ contains the explained variance ratio for each principal component
variance_ratio = xDf_PCA.explained_variance_ratio_

# Calculate cumulative variance ratio: entry k is the total ratio explained by the first k components
cumulative_var_ratio = np.cumsum(variance_ratio)

# Create bar plot with cumulative variance ratio (components numbered from 1)
plt.bar(range(1, len(cumulative_var_ratio)+1), cumulative_var_ratio, align='center')
plt.xticks(range(1, len(cumulative_var_ratio)+1))
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()



### Limitations of PCA
1. _Linearity assumption:_ PCA assumes that the data is linearly related. If the data has a nonlinear structure, PCA may not be the most appropriate technique.

2. _Loss of interpretability:_ After performing PCA, the principal components may not be directly interpretable in terms of the original variables. This can make it difficult to explain the results to non-technical stakeholders.

3. _Sensitivity to outliers:_ PCA is sensitive to outliers, which can distort the results and lead to incorrect conclusions.

4. _Sensitivity to scaling:_ PCA is sensitive to the scale of the variables, which can affect the results. It is important to standardize the variables before performing PCA.

5. _Difficulty in choosing the number of components:_ Choosing the number of components to retain can be a challenging task. If too few components are retained, important information may be lost. If too many components are retained, the results may be overfit and not generalize well to new data.

6. _Correlation-based:_ PCA assumes that variables are linearly correlated with each other. If the variables are not correlated, PCA may not be the most appropriate technique.

7. _Lack of robustness:_ PCA is not a robust technique and can be affected by outliers and influential observations.

## **Image Compression with SVD**

In [None]:
# import module
import requests
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Download the sample image and store it locally.
url = 'https://media.geeksforgeeks.org/wp-content/cdn-uploads/20210401173418/Webp-compressed.jpg'
# A timeout avoids hanging forever; raise_for_status() fails fast on an HTTP error
# instead of silently writing an error page to disk.
response = requests.get(url, timeout=30)
response.raise_for_status()

# The file keeps the name 'image.png' although the URL points to a .jpg.
with open('image.png', 'wb') as f:
    f.write(response.content)

img = cv2.imread('image.png')
# cv2.imread signals failure by returning None rather than raising.
if img is None:
    raise ValueError("cv2.imread could not decode 'image.png'")

# Converting the image into gray scale for faster
# computation.
gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Calculating the SVD.
# Note: np.linalg.svd returns the right singular vectors already transposed,
# so `v` holds V^T (its rows are the right singular vectors).
u, s, v = np.linalg.svd(gray_image, full_matrices=False)

# inspect shapes of the matrices
print(f'u.shape:{u.shape},s.shape:{s.shape},v.shape:{v.shape}')

In [None]:
# import module
import seaborn as sns

# Share of the total squared singular values carried by each singular value,
# rounded to six decimals.
squared_singular_values = np.square(s)
var_explained = np.round(squared_singular_values / squared_singular_values.sum(), decimals=6)

# Variance explained top Singular vectors
top_var_explained = var_explained[:20]
print(f'variance Explained by Top 20 singular values:\n{top_var_explained}')

# Bar chart of the same 20 values, numbered 1..20 on the x-axis
sns.barplot(
    x=list(range(1, 21)),
    y=top_var_explained,
    color="dodgerblue",
)

plt.title('Variance Explained Graph')
plt.xlabel('Singular Vector', fontsize=16)
plt.ylabel('Variance Explained', fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# plot images with different number of components
# The first panel uses every singular value (full reconstruction). The count comes
# from `s` itself, so the cell works for any image size instead of one hard-coded rank.
full_rank = len(s)
comps = [full_rank, 1, 5, 10, 15, 20]
plt.figure(figsize=(12, 6))

for i, k in enumerate(comps):
    # Rank-k approximation: scale the first k columns of u by the first k singular
    # values (broadcasting), then multiply by the first k rows of v (= V^T).
    # This equals u[:, :k] @ np.diag(s[:k]) @ v[:k, :] without building the dense diagonal matrix.
    low_rank = (u[:, :k] * s[:k]) @ v[:k, :]

    plt.subplot(2, 3, i + 1)
    plt.imshow(low_rank, cmap='gray')
    if i == 0:
        plt.title(f'Actual Image with n_components = {k}')
    else:
        plt.title(f'n_components = {k}')

plt.tight_layout()
plt.show()
