# Capstone: Portfolio Optimization

In [1]:
#Base
import numpy as np
import pandas as pd
from datetime import datetime

#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

#Data Optimization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

#Analysis
import statsmodels.api as sm
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.optimize import minimize
import cvxpy as cp


# 1. Data Wrangling

### 1.1 Upload the csv files

In [2]:
df=pd.read_csv('forecast2.csv')

### 1.2 Data Exploration & Preparation

In [3]:
#We can see all the columns with the close suffix that is the closing price
#and volume that is the number of transactions.
df.head(1)

Unnamed: 0,date,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
0,2020-02-07,28.379999,80.0075,132.929993,91.940002,87.870003,211.580002,366.089996,112.269997,45.709999,...,27.65,126.910004,28.58,68.709999,61.470001,57.099998,37.169998,102.040001,151.834946,46.23


In [4]:
df = df.iloc[:, 1:477].copy()
df

Unnamed: 0,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,ADP_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
0,28.379999,80.007500,132.929993,91.940002,87.870003,211.580002,366.089996,112.269997,45.709999,179.100006,...,27.650000,126.910004,28.580000,68.709999,61.470001,57.099998,37.169998,102.040001,151.834946,46.230000
1,28.379999,80.007500,132.929993,91.940002,87.870003,211.580002,366.089996,112.269997,45.709999,179.100006,...,27.650000,126.910004,28.580000,68.709999,61.470001,57.099998,37.169998,102.040001,151.834946,46.230000
2,28.379999,80.007500,132.929993,91.940002,87.870003,211.580002,366.089996,112.269997,45.709999,179.100006,...,27.650000,126.910004,28.580000,68.709999,61.470001,57.099998,37.169998,102.040001,151.834946,46.230000
3,28.790001,80.387497,130.679993,93.599998,88.300003,212.550003,370.000000,112.900002,45.680000,179.360001,...,28.299999,127.879997,28.650000,69.059998,59.959999,57.450001,37.689999,102.870003,153.611649,45.650002
4,29.840000,79.902496,132.110001,94.570000,89.169998,212.139999,369.279999,116.879997,46.119999,177.970001,...,28.240000,131.990005,28.900000,68.959999,60.529999,58.000000,37.509998,104.050003,154.650482,46.279999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092,-48.992513,184.641510,183.486600,124.690785,167.656430,405.435459,765.407264,167.719250,40.495848,204.319346,...,14.648993,-40.046027,19.419445,89.536712,5.075815,33.207364,9.289883,156.182208,139.992520,-2.621450
1093,-47.555013,183.117147,182.736600,124.490467,167.265900,400.894612,752.235774,171.030807,40.620848,205.569346,...,14.943646,-32.171027,20.036632,88.929121,6.888315,34.130341,9.785976,155.119708,141.117520,-0.746450
1094,-47.023763,181.311186,180.736600,123.430497,166.349918,400.808537,748.806798,171.022703,39.995848,205.569346,...,14.961584,-27.921027,19.903820,89.191209,7.107065,34.328326,10.188320,155.619708,141.242520,0.441050
1095,-47.180013,181.268741,180.736600,123.438262,166.356637,400.664105,749.465716,170.961791,39.933348,205.444346,...,15.010935,-27.921027,19.919445,89.291420,6.857065,34.329779,10.164883,155.557208,141.492520,0.316050


# 2. Analysis of correlations

In [5]:
correlation_matrix = df.corr()

# Find the 30 stocks with the minimal correlation
min_correlation_stocks = correlation_matrix.min().nsmallest(30).index

# Subset the original DataFrame with the selected stocks
selected_stocks = df[min_correlation_stocks]

In [6]:
expected_returns = df.mean()

In [7]:
weights = cp.Variable(len(df.columns))

In [8]:
expected_returns = df.mean()
covariance_matrix = df.cov()

In [9]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()
returns_df

Unnamed: 0,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,ADP_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.014447,0.004750,-0.016926,0.018055,0.004894,0.004585,0.010680,0.005612,-0.000656,0.001452,...,0.023508,0.007643,0.002449,0.005094,-0.024565,0.006130,0.013990,0.008134,0.011702,-0.012546
4,0.036471,-0.006033,0.010943,0.010363,0.009853,-0.001929,-0.001946,0.035252,0.009632,-0.007750,...,-0.002120,0.032140,0.008726,-0.001448,0.009506,0.009574,-0.004776,0.011471,0.006763,0.013801
5,0.021113,0.023748,0.016729,0.012795,-0.004261,-0.000707,0.013567,0.016513,-0.011058,0.000337,...,-0.074717,0.036594,0.029412,0.001015,0.012225,0.000172,0.018129,0.012398,-0.003704,-0.003673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092,0.034306,0.008654,0.020864,0.001591,0.003177,0.005858,0.002005,-0.010090,-0.001541,-0.002441,...,0.006145,0.274498,-0.018557,0.004073,-0.220702,-0.009784,-0.036073,-0.007546,0.002686,1.806841
1093,-0.029341,-0.008256,-0.004087,-0.001607,-0.002329,-0.011200,-0.017208,0.019745,0.003087,0.006118,...,0.020114,-0.196649,0.031782,-0.006786,0.357085,0.027794,0.053402,-0.006803,0.008036,-0.715253
1094,-0.011171,-0.009862,-0.010945,-0.008514,-0.005476,-0.000215,-0.004558,-0.000047,-0.015386,0.000000,...,0.001200,-0.132106,-0.006628,0.002947,0.031757,0.005801,0.041114,0.003223,0.000886,-1.590863
1095,0.003323,-0.000234,0.000000,0.000063,0.000040,-0.000360,0.000880,-0.000356,-0.001563,-0.000608,...,0.003299,0.000000,0.000785,0.001124,-0.035176,0.000042,-0.002300,-0.000402,0.001770,-0.283415


In [10]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()

# Calculate the expected returns and covariance matrix
expected_returns = returns_df.mean()
covariance_matrix = returns_df.cov()

# Define the objective function to maximize the Sharpe ratio
def objective_function(weights):
    returns = np.dot(weights, expected_returns)
    volatility = np.sqrt(np.dot(weights, np.dot(covariance_matrix, weights.T)))
    sharpe_ratio = returns / volatility
    return -sharpe_ratio  # Negate for maximization

# Define the constraints (e.g., weights sum up to 1)
constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x) - 1},  # Sum of weights equal to 1
               {'type': 'ineq', 'fun': lambda x: x}]  # Non-negative weights

# Define the bounds for the weights (0 <= weight <= 1)
bounds = [(0, 1)] * len(df.columns)

# Set an initial guess for the weights
initial_guess = [1 / len(df.columns)] * len(df.columns)

# Solve the optimization problem
result = minimize(objective_function, initial_guess, method='SLSQP', bounds=bounds, constraints=constraints)
optimal_weights = result.x

# Create a DataFrame with the optimal portfolio weights
portfolio_df = pd.DataFrame({'Stock': df.columns, 'Weight': optimal_weights})

# Sort the DataFrame by weights in descending order
portfolio_df = portfolio_df.sort_values('Weight', ascending=False)

# 3. Stock Selection

In [11]:
# Normalize the weights of the top 30 stocks so that they sum up to 1
selected_stocks = portfolio_df[:30]
normalized_weights = selected_stocks['Weight'] / np.sum(selected_stocks['Weight'])

# Assign the normalized weights back to the selected_weights DataFrame
selected_stocks['Normalized_Weight'] = normalized_weights

selected_stocks = selected_stocks.drop(columns=['Weight'])

selected_stocks 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_stocks['Normalized_Weight'] = normalized_weights


Unnamed: 0,Stock,Normalized_Weight
103,DHR_close,0.164344
351,TSCO_close,0.141795
211,KR_close,0.116498
259,NEM_close,0.076839
317,RRC_close,0.066167
74,CLX_close,0.053239
306,REGN_close,0.049015
31,AMZN_close,0.047142
368,VLO_close,0.037066
39,ATVI_close,0.034219


In [12]:
selected_stocks.to_csv('selected_stocks(forecast)2.csv',index=False)

# 4. Repeating the process for historical data

In [13]:
df=pd.read_csv('historical2.csv')

In [14]:
df = df.iloc[:, 1:477].copy()

In [15]:
correlation_matrix = df.corr()

# Find the 30 stocks with the minimal correlation
min_correlation_stocks = correlation_matrix.min().nsmallest(30).index

# Subset the original DataFrame with the selected stocks
selected_stocks = df[min_correlation_stocks]

expected_returns = df.mean()

weights = cp.Variable(len(df.columns))

expected_returns = df.mean()
covariance_matrix = df.cov()

In [16]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()

# Calculate the expected returns and covariance matrix
expected_returns = returns_df.mean()
covariance_matrix = returns_df.cov()

# Define the objective function to maximize the Sharpe ratio
def objective_function(weights):
    returns = np.dot(weights, expected_returns)
    volatility = np.sqrt(np.dot(weights, np.dot(covariance_matrix, weights.T)))
    sharpe_ratio = returns / volatility
    return -sharpe_ratio  # Negate for maximization

# Define the constraints (e.g., weights sum up to 1)
constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x) - 1},  # Sum of weights equal to 1
               {'type': 'ineq', 'fun': lambda x: x}]  # Non-negative weights

# Define the bounds for the weights (0 <= weight <= 1)
bounds = [(0, 1)] * len(df.columns)

# Set an initial guess for the weights
initial_guess = [1 / len(df.columns)] * len(df.columns)

# Solve the optimization problem
result = minimize(objective_function, initial_guess, method='SLSQP', bounds=bounds, constraints=constraints)
optimal_weights = result.x

# Create a DataFrame with the optimal portfolio weights
portfolio_df = pd.DataFrame({'Stock': df.columns, 'Weight': optimal_weights})

# Sort the DataFrame by weights in descending order
portfolio_df = portfolio_df.sort_values('Weight', ascending=False)

In [17]:
# Normalize the weights of the top 30 stocks so that they sum up to 1
selected_stocks = portfolio_df[:30]
normalized_weights = selected_stocks['Weight'] / np.sum(selected_stocks['Weight'])

# Assign the normalized weights back to the selected_weights DataFrame
selected_stocks['Normalized_Weight'] = normalized_weights

selected_stocks = selected_stocks.drop(columns=['Weight'])

selected_stocks 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_stocks['Normalized_Weight'] = normalized_weights


Unnamed: 0,Stock,Normalized_Weight
270,NVDA_close,0.1869732
211,KR_close,0.1665118
351,TSCO_close,0.1587889
137,EXR_close,0.1440623
317,RRC_close,0.1258886
128,EQT_close,0.06223997
259,NEM_close,0.05932953
310,RHI_close,0.03628181
364,UPS_close,0.0298952
302,PWR_close,0.01944824


In [18]:
selected_stocks.to_csv('selected_stocks(historical)2.csv',index=False)