# In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ydata_profiling as pp
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

# Load the raw data set from the CSV file in the working directory.
df = pd.read_csv("netflix.csv")

# Preview the first rows (rendered only when run as a notebook cell).
df.head()

df.shape  # (rows, columns) -- the original author noted 5044 rows and 9 columns

# Parse 'Date' once, then expand it into numeric calendar parts.
parsed_dates = pd.to_datetime(df['Date'])
df = df.assign(
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
)

# 'Date' is redundant once year/month/day have been extracted, so drop it.
df = df.drop(columns='Date')

# Missing values per column.
null_counts = df.isna().sum()
null_counts

# Number of fully duplicated rows; 0 means there are no redundant records.
duplicate_rows = df.duplicated().sum()
duplicate_rows

# Column dtypes, non-null counts and memory usage (printed to stdout).
df.info()

# Descriptive statistics (mean, std, min, max, quartiles) for the numeric columns.
summary_stats = df.describe()
summary_stats

# Compute the correlation matrix to quantify the relationships between the
# numerical features. numeric_only=True keeps this working if the frame holds
# any non-numeric column: since pandas 2.0 the default is numeric_only=False,
# which raises on such columns instead of silently skipping them.
corr = df.corr(numeric_only=True)

# Bar chart of the non-null count per column (visual missing-data check).
import missingno as msno
msno.bar(df)

# Find the number of unique values in each column
unique_values = df.nunique()

# Display the unique value counts
unique_values


# Correlation heatmap on a 12 x 8 inch canvas, with each cell annotated by its
# coefficient and a colour bar for scale.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', cbar=True, ax=heatmap_ax)
plt.show()

# Pair plot: scatter plots for every pair of columns plus per-column
# distributions on the diagonal; handy for spotting trends and patterns.
sns.pairplot(df)

# One distribution plot (histogram + KDE curve) per column.
# sns.distplot is deprecated and removed in newer seaborn releases; histplot
# with kde=True and stat="density" is the documented replacement that keeps the
# same density-normalised look.
for column in df.columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[column], kde=True, stat="density")
    plt.title(column)
    plt.show()

# Build the automated profiling report for the whole frame (rendered inline
# when this is the last expression of a notebook cell).
pp.ProfileReport(df)

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Feature Engineering
# NOTE(review): the lag/rolling features below assume the rows are in
# chronological order -- confirm against the CSV, since 'Date' has already
# been dropped at this point.

# Intraday trading range.
df['Price_Diff'] = df['High'] - df['Low']

# The model's target is the same-row 'Close'. Any feature computed from that
# same value leaks the answer (e.g. Open * (1 + Pct_Change) == Close exactly),
# so every Close-derived feature is built from *previous* rows only.
previous_close = df['Close'].shift(1)
df['Close_MA_5'] = previous_close.rolling(window=5).mean()   # mean of the 5 prior closes
df['Close_Lag_1'] = previous_close                           # prior row's close
df['Volume_MA_5'] = df['Volume'].rolling(window=5).mean()    # Volume is itself a feature, no leak
df['Pct_Change'] = ((df['Close'] - df['Open']) / df['Open']).shift(1)  # prior row's open-to-close return

# Inspect the engineered columns (rendered only in a notebook cell).
df['Price_Diff'],df['Close_MA_5'],df['Close_Lag_1'],df['Volume_MA_5'],df['Pct_Change'] 

# The shift/rolling windows leave NaNs in the leading rows; count them ...
df.isnull().sum()

# ... and drop those incomplete rows.
df = df.dropna()

# Define features and target
X = df[['Open', 'High', 'Low', 'Volume', 'Price_Diff', 'Close_MA_5', 'Close_Lag_1', 'Volume_MA_5', 'Pct_Change']]
y = df['Close']

# Split FIRST, so that the scaler and polynomial expansion below are fitted on
# training rows only. Fitting them on the full data set lets test-set
# statistics leak into training. The same random_state and row count give the
# same train/test membership as splitting the transformed matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: learn mean/std from the training split, apply to both.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Polynomial Features (degree 2, no bias column), again fitted on train only.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train = poly.fit_transform(X_train_scaled)
X_test = poly.transform(X_test_scaled)

# Hyperparameter tuning for Ridge Regression (5-fold CV, scored by negative MSE)
params = {'alpha': [0.1, 1.0, 10.0, 100.0]}
ridge_grid = GridSearchCV(Ridge(), params, cv=5, scoring='neg_mean_squared_error')
ridge_grid.fit(X_train, y_train)

# Best alpha found by the grid search.
best_alpha = ridge_grid.best_params_['alpha']
# The f prefix is required; without it the literal text "{best_alpha}" is printed.
print(f'Best Alpha for Ridge: {best_alpha}')

# GridSearchCV (refit=True by default) has already refitted a Ridge with the
# best alpha on the whole training split, so reuse it instead of fitting again.
ridge_model = ridge_grid.best_estimator_

# Predict on the test data
y_pred_ridge = ridge_model.predict(X_test)

# Calculate the mean squared error
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f'Mean Squared Error (Ridge Regression): {mse_ridge}')

# Pair every held-out actual price with its prediction and renumber the rows
# 0..n-1 so they can serve as bar positions.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_ridge}).reset_index(drop=True)

# Bar geometry: two bars per sample, offset half a bar width either side.
bar_width = 0.35
x = np.arange(len(comparison_df))
half_width = bar_width / 2

# Grouped bar chart of actual vs predicted prices.
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
bar_ax.bar(x - half_width, comparison_df['Actual'], width=bar_width, label='Actual', color='blue')
bar_ax.bar(x + half_width, comparison_df['Predicted'], width=bar_width, label='Predicted', color='orange')
bar_ax.set_xlabel('Test Sample Index')
bar_ax.set_ylabel('Prices')
bar_ax.set_title('Comparison of Actual vs Predicted Prices')
bar_ax.legend()
plt.show()

# Residual = actual minus predicted, one value per test sample.
residuals = y_test - y_pred_ridge

# Histogram of the residuals with a dashed reference line at zero error.
resid_fig, resid_ax = plt.subplots(figsize=(10, 5))
resid_ax.hist(residuals, bins=20, color='blue', edgecolor='black')
resid_ax.axvline(x=0, color='red', linestyle='--', label='Zero Residual')
resid_ax.set_xlabel('Residuals')
resid_ax.set_ylabel('Frequency')
resid_ax.set_title('Histogram of Residuals')
resid_ax.legend()
plt.show()

# Plot actual vs predicted prices against the real dates of the test rows.
# y_test comes from a shuffled split, so its rows are NOT the last
# len(y_test) rows of df. Each test row's date is rebuilt from the
# year/month/day columns via y_test's own index labels, then everything is
# sorted chronologically so the lines run left to right in time.
test_dates = pd.to_datetime(df.loc[y_test.index, ['year', 'month', 'day']])
chronological = np.argsort(test_dates.to_numpy(), kind='stable')
sorted_dates = test_dates.to_numpy()[chronological]
sorted_actual = y_test.to_numpy()[chronological]
sorted_predicted = np.asarray(y_pred_ridge)[chronological]

plt.figure(figsize=(12, 6))
plt.plot(sorted_dates, sorted_actual, label='Actual Prices', color='blue', marker='o')
plt.plot(sorted_dates, sorted_predicted, label='Predicted Prices', color='orange', marker='o')
plt.xlabel('Date')
plt.ylabel('Prices')
plt.title('Actual vs Predicted Prices Over Time')
plt.legend()
plt.show()
