# Capstone: Portfolio Optimization

In [1]:
#Base
import numpy as np
import pandas as pd
from datetime import datetime

#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

#Data Optimization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

#Analysis
import statsmodels.api as sm
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.optimize import minimize
import cvxpy as cp


# 1. Data Wrangling

### 1.1 Upload the csv files

In [2]:
df=pd.read_csv('forecast2.csv')

### 1.2 Data Exploration & Preparation

In [3]:
#We can see all the columns with the close suffix that is the closing price
#and volume that is the number of transactions.
df.head(1)

Unnamed: 0,date,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
0,2019-02-07,36.169998,42.735001,163.130005,85.800003,72.739998,155.179993,253.740005,100.629997,41.400002,...,18.32,126.879997,25.91,53.130001,74.68,42.860001,28.98,94.610001,114.844658,49.299999


In [4]:
df = df.iloc[:, 1:477].copy()
df

Unnamed: 0,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,ADP_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
0,36.169998,42.735001,163.130005,85.800003,72.739998,155.179993,253.740005,100.629997,41.400002,145.679993,...,18.320000,126.879997,25.910000,53.130001,74.680000,42.860001,28.980000,94.610001,114.844658,49.299999
1,36.000000,42.602501,162.410004,84.309998,73.330002,155.589996,257.000000,100.730003,41.759998,146.809998,...,18.010000,125.209999,25.650000,53.480000,73.980003,42.619999,29.090000,94.489998,117.067963,49.040001
2,36.000000,42.602501,162.410004,84.309998,73.330002,155.589996,257.000000,100.730003,41.759998,146.809998,...,18.010000,125.209999,25.650000,53.480000,73.980003,42.619999,29.090000,94.489998,117.067963,49.040001
3,36.000000,42.602501,162.410004,84.309998,73.330002,155.589996,257.000000,100.730003,41.759998,146.809998,...,18.010000,125.209999,25.650000,53.480000,73.980003,42.619999,29.090000,94.489998,117.067963,49.040001
4,36.590000,42.357498,165.130005,83.639999,73.540001,155.320007,258.390015,101.750000,42.009998,147.509995,...,17.860001,125.000000,25.400000,53.570000,74.099998,42.970001,29.290001,93.260002,118.669907,49.419998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092,-2.000924,224.661755,172.335769,120.070119,138.285730,324.900496,735.012158,164.097076,56.474801,165.663786,...,20.063609,126.856384,42.087933,71.864541,18.702058,57.632920,8.218858,111.353416,157.705169,32.056382
1093,-1.617000,227.638314,170.525305,118.335744,138.598230,323.058211,740.762158,162.347076,56.141891,164.413786,...,19.797984,122.620883,41.587933,71.481173,18.321002,55.445420,8.554918,112.665916,156.756392,33.150132
1094,-1.595359,228.003414,170.200821,119.507619,138.973230,325.383895,744.574658,165.253326,56.323497,167.163786,...,19.891734,129.595414,42.275433,71.502695,17.756731,56.757920,8.296078,111.103416,157.426440,32.431382
1095,-1.565266,227.810541,170.182376,119.382619,139.035730,325.550887,744.387158,165.315826,56.333604,167.163786,...,19.844859,130.086948,42.306683,71.473563,17.850547,56.882920,8.264924,111.165916,157.351639,32.525132


# 2. Analysis of correlations

In [5]:
correlation_matrix = df.corr()

# Find the 30 stocks with the minimal correlation
min_correlation_stocks = correlation_matrix.min().nsmallest(30).index

# Subset the original DataFrame with the selected stocks
selected_stocks = df[min_correlation_stocks]

In [6]:
expected_returns = df.mean()

In [7]:
weights = cp.Variable(len(df.columns))

In [8]:
expected_returns = df.mean()
covariance_matrix = df.cov()

In [9]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()
returns_df

Unnamed: 0,AAL_close,AAPL_close,AAP_close,ABC_close,ABT_close,ACN_close,ADBE_close,ADI_close,ADM_close,ADP_close,...,WU_close,WYNN_close,WY_close,XEL_close,XOM_close,XRAY_close,XRX_close,YUM_close,ZBH_close,ZION_close
1,-0.004700,-0.003100,-0.004414,-0.017366,0.008111,0.002642,0.012848,0.000994,0.008696,0.007757,...,-0.016921,-0.013162,-0.010035,0.006588,-0.009373,-0.005600,0.003796,-0.001268,0.019359,-0.005274
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.016389,-0.005751,0.016748,-0.007947,0.002864,-0.001735,0.005409,0.010126,0.005987,0.004768,...,-0.008329,-0.001677,-0.009747,0.001683,0.001622,0.008212,0.006875,-0.013017,0.013684,0.007749
5,-0.022957,0.008617,0.016714,0.025705,0.020669,0.008434,0.011533,0.022703,0.007855,0.013491,...,0.007279,0.009040,-0.002756,-0.001680,0.017544,0.000233,0.027313,0.008471,0.014072,0.009510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092,-0.448948,-0.014910,-0.036674,-0.006978,-0.006734,0.000922,-0.005665,0.025986,0.005002,0.006837,...,-0.018348,0.090993,0.021619,-0.003633,0.095502,0.017658,-0.025437,0.017998,-0.011268,0.037419
1093,-0.191873,0.013249,-0.010505,-0.014445,0.002260,-0.005670,0.007823,-0.010664,-0.005895,-0.007545,...,-0.013239,-0.033388,-0.011880,-0.005335,-0.020375,-0.037956,0.040889,0.011787,-0.006016,0.034120
1094,-0.013383,0.001604,-0.001903,0.009903,0.002706,0.007199,0.005147,0.017901,0.003235,0.016726,...,0.004735,0.056879,0.016531,0.000301,-0.030799,0.023672,-0.030256,-0.013868,0.004274,-0.021682
1095,-0.018863,-0.000846,-0.000108,-0.001046,0.000450,0.000513,-0.000252,0.000378,0.000179,0.000000,...,-0.002357,0.003793,0.000739,-0.000407,0.005283,0.002202,-0.003755,0.000563,-0.000475,0.002891


In [10]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()

# Calculate the expected returns and covariance matrix
expected_returns = returns_df.mean()
covariance_matrix = returns_df.cov()

# Define the objective function to maximize the Sharpe ratio
def objective_function(weights):
    returns = np.dot(weights, expected_returns)
    volatility = np.sqrt(np.dot(weights, np.dot(covariance_matrix, weights.T)))
    sharpe_ratio = returns / volatility
    return -sharpe_ratio  # Negate for maximization

# Define the constraints (e.g., weights sum up to 1)
constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x) - 1},  # Sum of weights equal to 1
               {'type': 'ineq', 'fun': lambda x: x}]  # Non-negative weights

# Define the bounds for the weights (0 <= weight <= 1)
bounds = [(0, 1)] * len(df.columns)

# Set an initial guess for the weights
initial_guess = [1 / len(df.columns)] * len(df.columns)

# Solve the optimization problem
result = minimize(objective_function, initial_guess, method='SLSQP', bounds=bounds, constraints=constraints)
optimal_weights = result.x

# Create a DataFrame with the optimal portfolio weights
portfolio_df = pd.DataFrame({'Stock': df.columns, 'Weight': optimal_weights})

# Sort the DataFrame by weights in descending order
portfolio_df = portfolio_df.sort_values('Weight', ascending=False)

# 3. Stock Selection

In [11]:
# Normalize the weights of the top 30 stocks so that they sum up to 1
selected_stocks = portfolio_df[:30]
normalized_weights = selected_stocks['Weight'] / np.sum(selected_stocks['Weight'])

# Assign the normalized weights back to the selected_weights DataFrame
selected_stocks['Normalized_Weight'] = normalized_weights

selected_stocks = selected_stocks.drop(columns=['Weight'])

selected_stocks 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_stocks['Normalized_Weight'] = normalized_weights


Unnamed: 0,Stock,Normalized_Weight
345,TGT_close,0.149501
259,NEM_close,0.139757
74,CLX_close,0.083232
211,KR_close,0.078761
1,AAPL_close,0.070582
31,AMZN_close,0.065027
79,CMG_close,0.063833
186,IDXX_close,0.058513
103,DHR_close,0.053227
25,AMD_close,0.043596


In [12]:
selected_stocks.to_csv('selected_stocks(forecast)2.csv',index=False)

# 4. Repeating the process for historical data

In [13]:
df=pd.read_csv('historical2.csv')

In [14]:
df = df.iloc[:, 1:477].copy()

In [15]:
correlation_matrix = df.corr()

# Find the 30 stocks with the minimal correlation
min_correlation_stocks = correlation_matrix.min().nsmallest(30).index

# Subset the original DataFrame with the selected stocks
selected_stocks = df[min_correlation_stocks]

expected_returns = df.mean()

weights = cp.Variable(len(df.columns))

expected_returns = df.mean()
covariance_matrix = df.cov()

In [16]:
# Calculate the daily returns for each stock
returns_df = df.pct_change().dropna()

# Calculate the expected returns and covariance matrix
expected_returns = returns_df.mean()
covariance_matrix = returns_df.cov()

# Define the objective function to maximize the Sharpe ratio
def objective_function(weights):
    returns = np.dot(weights, expected_returns)
    volatility = np.sqrt(np.dot(weights, np.dot(covariance_matrix, weights.T)))
    sharpe_ratio = returns / volatility
    return -sharpe_ratio  # Negate for maximization

# Define the constraints (e.g., weights sum up to 1)
constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x) - 1},  # Sum of weights equal to 1
               {'type': 'ineq', 'fun': lambda x: x}]  # Non-negative weights

# Define the bounds for the weights (0 <= weight <= 1)
bounds = [(0, 1)] * len(df.columns)

# Set an initial guess for the weights
initial_guess = [1 / len(df.columns)] * len(df.columns)

# Solve the optimization problem
result = minimize(objective_function, initial_guess, method='SLSQP', bounds=bounds, constraints=constraints)
optimal_weights = result.x

# Create a DataFrame with the optimal portfolio weights
portfolio_df = pd.DataFrame({'Stock': df.columns, 'Weight': optimal_weights})

# Sort the DataFrame by weights in descending order
portfolio_df = portfolio_df.sort_values('Weight', ascending=False)

In [17]:
# Normalize the weights of the top 30 stocks so that they sum up to 1
selected_stocks = portfolio_df[:30]
normalized_weights = selected_stocks['Weight'] / np.sum(selected_stocks['Weight'])

# Assign the normalized weights back to the selected_weights DataFrame
selected_stocks['Normalized_Weight'] = normalized_weights

selected_stocks = selected_stocks.drop(columns=['Weight'])

selected_stocks 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_stocks['Normalized_Weight'] = normalized_weights


Unnamed: 0,Stock,Normalized_Weight
345,TGT_close,0.2496622
1,AAPL_close,0.1418873
259,NEM_close,0.1391333
79,CMG_close,0.1195979
39,ATVI_close,0.09009057
186,IDXX_close,0.05272973
304,QCOM_close,0.04494595
302,PWR_close,0.04208571
103,DHR_close,0.0389456
25,AMD_close,0.03586026


In [18]:
selected_stocks.to_csv('selected_stocks(historical)2.csv',index=False)