# Capstone: Portfolio Optimization

In [1]:
#Base
import numpy as np
import pandas as pd
from datetime import datetime

#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

#Data Optimization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

#Analysis
import statsmodels.api as sm
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.optimize import minimize
import cvxpy as cp


# 1. Data Wrangling

### 1.1 Upload the csv files

In [2]:
df=pd.read_csv('forecast2.csv')

### 1.2 Data Exploration & Preparation

In [3]:
#We can see all the columns with the close suffix that is the closing price
#and volume that is the number of transactions.
df.head(1)

Unnamed: 0,date,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
0,2018-02-07,51.400002,39.884998,109.93,94.220001,58.669998,155.149994,192.339996,85.349998,42.189999,...,19.49,177.320007,34.52,42.459999,76.940002,58.299999,31.18,80.129997,117.262138,54.02


In [4]:
df = df.iloc[:, 1:477].copy()
df

Unnamed: 0,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,ADP_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
0,51.400002,39.884998,109.930000,94.220001,58.669998,155.149994,192.339996,85.349998,42.189999,113.559998,...,19.490000,177.320007,34.520000,42.459999,76.940002,58.299999,31.180000,80.129997,117.262138,54.020000
1,48.599998,38.787498,109.629997,91.550003,56.270000,150.509995,185.160004,82.680000,41.349998,108.250000,...,18.809999,169.279999,33.599998,42.439999,76.070000,56.849998,29.799999,76.300003,112.165047,50.709999
2,48.360001,39.102501,109.139999,89.190002,57.169998,153.839996,187.990005,83.930000,41.490002,111.430000,...,19.360001,166.220001,33.790001,43.349998,75.779999,57.180000,29.610001,79.309998,113.000000,51.639999
3,48.360001,39.102501,109.139999,89.190002,57.169998,153.839996,187.990005,83.930000,41.490002,111.430000,...,19.360001,166.220001,33.790001,43.349998,75.779999,57.180000,29.610001,79.309998,113.000000,51.639999
4,48.360001,39.102501,109.139999,89.190002,57.169998,153.839996,187.990005,83.930000,41.490002,111.430000,...,19.360001,166.220001,33.790001,43.349998,75.779999,57.180000,29.610001,79.309998,113.000000,51.639999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,36.606709,55.625759,171.802775,125.107555,85.863982,161.440507,384.045885,104.136902,54.546435,166.274533,...,14.909949,59.063899,29.248554,54.650627,71.918702,37.296844,20.084826,105.123942,130.132802,42.645541
1092,36.311619,56.565315,169.382854,122.768791,87.457993,162.477834,391.295885,103.558777,54.337610,167.938595,...,14.241980,70.620767,28.889533,54.551800,71.165960,34.369645,20.209826,105.336123,129.762431,43.004916
1093,35.528645,56.684545,172.253947,123.787106,87.380090,160.523640,386.045885,102.621277,54.567809,167.696408,...,14.529089,57.466724,28.390476,55.437253,70.944341,35.491756,20.366076,105.538980,131.180349,40.598666
1094,35.633666,56.076777,173.402385,124.929447,86.274562,161.280659,380.858385,102.965027,54.478103,166.391720,...,14.847449,52.176770,28.441511,55.668538,70.636867,37.221244,19.959826,105.655366,130.510158,40.426791


# 2. Analysis of correlations

In [5]:
correlation_matrix = df.corr()

# Find the 30 stocks with the minimal correlation
min_correlation_stocks = correlation_matrix.min().nsmallest(30).index

# Subset the original DataFrame with the selected stocks
selected_stocks = df[min_correlation_stocks]

In [6]:
expected_returns = df.mean()

In [7]:
weights = cp.Variable(len(df.columns))

In [8]:
expected_returns = df.mean()
covariance_matrix = df.cov()

In [9]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()
returns_df

Unnamed: 0,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,ADP_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
1,-0.054475,-0.027517,-0.002729,-0.028338,-0.040907,-0.029907,-0.037330,-0.031283,-0.019910,-0.046759,...,-0.034890,-0.045342,-0.026651,-0.000471,-0.011308,-0.024871,-0.044259,-0.047797,-0.043467,-0.061274
2,-0.004938,0.008121,-0.004470,-0.025778,0.015994,0.022125,0.015284,0.015119,0.003386,0.029376,...,0.029240,-0.018077,0.005655,0.021442,-0.003812,0.005805,-0.006376,0.039449,0.007444,0.018340
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.035773,0.040279,0.003115,0.002915,0.004198,0.007215,0.022129,0.013344,-0.005303,0.001974,...,0.021178,-0.019853,0.016869,0.009458,0.008445,0.003498,0.011483,-0.015635,0.001117,0.008133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,0.001825,0.002809,-0.019108,-0.018934,0.015390,-0.002729,0.025022,0.004673,-0.002869,0.010301,...,-0.030973,0.262406,0.009413,-0.002109,0.022124,-0.055856,0.012604,0.007260,0.001727,0.011864
1092,-0.008061,0.016891,-0.014085,-0.018694,0.018564,0.006425,0.018878,-0.005552,-0.003828,0.010008,...,-0.044800,0.195667,-0.012275,-0.001808,-0.010467,-0.078484,0.006224,0.002018,-0.002846,0.008427
1093,-0.021563,0.002108,0.016950,0.008295,-0.000891,-0.012027,-0.013417,-0.009053,0.004236,-0.001442,...,0.020159,-0.186263,-0.017275,0.016231,-0.003114,0.032648,0.007731,0.001926,0.010927,-0.055953
1094,0.002956,-0.010722,0.006667,0.009228,-0.012652,0.004716,-0.013438,0.003350,-0.001644,-0.007780,...,0.021912,-0.092052,0.001798,0.004172,-0.004334,0.048729,-0.019947,0.001103,-0.005109,-0.004234


In [10]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()

# Calculate the expected returns and covariance matrix
expected_returns = returns_df.mean()
covariance_matrix = returns_df.cov()

# Define the objective function to maximize the Sharpe ratio
def objective_function(weights):
    returns = np.dot(weights, expected_returns)
    volatility = np.sqrt(np.dot(weights, np.dot(covariance_matrix, weights.T)))
    sharpe_ratio = returns / volatility
    return -sharpe_ratio  # Negate for maximization

# Define the constraints (e.g., weights sum up to 1)
constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x) - 1},  # Sum of weights equal to 1
               {'type': 'ineq', 'fun': lambda x: x}]  # Non-negative weights

# Define the bounds for the weights (0 <= weight <= 1)
bounds = [(0, 1)] * len(df.columns)

# Set an initial guess for the weights
initial_guess = [1 / len(df.columns)] * len(df.columns)

# Solve the optimization problem
result = minimize(objective_function, initial_guess, method='SLSQP', bounds=bounds, constraints=constraints)
optimal_weights = result.x

# Create a DataFrame with the optimal portfolio weights
portfolio_df = pd.DataFrame({'Stock': df.columns, 'Weight': optimal_weights})

# Sort the DataFrame by weights in descending order
portfolio_df = portfolio_df.sort_values('Weight', ascending=False)

# 3. Stock Selection

In [11]:
# Normalize the weights of the top 30 stocks so that they sum up to 1
selected_stocks = portfolio_df[:30]
normalized_weights = selected_stocks['Weight'] / np.sum(selected_stocks['Weight'])

# Assign the normalized weights back to the selected_weights DataFrame
selected_stocks['Normalized_Weight'] = normalized_weights

selected_stocks = selected_stocks.drop(columns=['Weight'])

selected_stocks 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_stocks['Normalized_Weight'] = normalized_weights


Unnamed: 0,Stock,Normalized_Weight
258,NEE_close,0.287054
218,LLY_close,0.099956
137,EXR_close,0.075154
343,TDG_close,0.057905
79,CMG_close,0.04614
179,HRL_close,0.044017
128,EQT_close,0.040945
82,CNC_close,0.037762
183,HUM_close,0.036838
356,UAA_close,0.032718


In [12]:
selected_stocks.to_csv('selected_stocks(forecast)2.csv',index=False)

# 4. Repeating the process for historical data

In [13]:
df=pd.read_csv('historical2.csv')

In [14]:
df = df.iloc[:, 1:477].copy()

In [15]:
correlation_matrix = df.corr()

# Find the 30 stocks with the minimal correlation
min_correlation_stocks = correlation_matrix.min().nsmallest(30).index

# Subset the original DataFrame with the selected stocks
selected_stocks = df[min_correlation_stocks]

expected_returns = df.mean()

weights = cp.Variable(len(df.columns))

expected_returns = df.mean()
covariance_matrix = df.cov()

In [16]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()

# Calculate the expected returns and covariance matrix
expected_returns = returns_df.mean()
covariance_matrix = returns_df.cov()

# Define the objective function to maximize the Sharpe ratio
def objective_function(weights):
    returns = np.dot(weights, expected_returns)
    volatility = np.sqrt(np.dot(weights, np.dot(covariance_matrix, weights.T)))
    sharpe_ratio = returns / volatility
    return -sharpe_ratio  # Negate for maximization

# Define the constraints (e.g., weights sum up to 1)
constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x) - 1},  # Sum of weights equal to 1
               {'type': 'ineq', 'fun': lambda x: x}]  # Non-negative weights

# Define the bounds for the weights (0 <= weight <= 1)
bounds = [(0, 1)] * len(df.columns)

# Set an initial guess for the weights
initial_guess = [1 / len(df.columns)] * len(df.columns)

# Solve the optimization problem
result = minimize(objective_function, initial_guess, method='SLSQP', bounds=bounds, constraints=constraints)
optimal_weights = result.x

# Create a DataFrame with the optimal portfolio weights
portfolio_df = pd.DataFrame({'Stock': df.columns, 'Weight': optimal_weights})

# Sort the DataFrame by weights in descending order
portfolio_df = portfolio_df.sort_values('Weight', ascending=False)

In [17]:
# Normalize the weights of the top 30 stocks so that they sum up to 1
selected_stocks = portfolio_df[:30]
normalized_weights = selected_stocks['Weight'] / np.sum(selected_stocks['Weight'])

# Assign the normalized weights back to the selected_weights DataFrame
selected_stocks['Normalized_Weight'] = normalized_weights

selected_stocks = selected_stocks.drop(columns=['Weight'])

selected_stocks 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_stocks['Normalized_Weight'] = normalized_weights


Unnamed: 0,Stock,Normalized_Weight
258,NEE_close,0.2162369
132,ETR_close,0.2030058
79,CMG_close,0.1332586
218,LLY_close,0.09563885
13,AES_close,0.07467862
343,TDG_close,0.07199801
25,AMD_close,0.05299497
46,AZO_close,0.02854427
87,COST_close,0.02707889
30,AMT_close,0.02401615


In [18]:
selected_stocks.to_csv('selected_stocks(historical)2.csv',index=False)