# Data Preprocessing for Gold Price Prediction

This notebook performs data cleaning, feature engineering, and scaling on the consolidated dataset.

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
# Load the consolidated dataset
df = pd.read_csv("../data/processed/consolidated.csv", parse_dates=["date"])
df.head()

Unnamed: 0.1,Unnamed: 0,date,open,high,low,close,volume,dxy_close,oil_close,fedfunds,gs10,m2real,cpi,nfp,ppi
0,0,2015-01-02,1184.0,1194.5,1169.5,1186.0,138,,,0.11,1.88,5015.7,,,
1,1,2015-01-05,1180.300049,1206.900024,1180.099976,1203.900024,470,,,0.11,1.88,5015.7,,,
2,2,2015-01-06,1203.5,1220.0,1203.5,1219.300049,97,,,0.11,1.88,5015.7,,,
3,3,2015-01-07,1219.199951,1219.199951,1210.599976,1210.599976,29,,,0.11,1.88,5015.7,,,
4,4,2015-01-08,1207.0,1215.699951,1206.300049,1208.400024,92,,,0.11,1.88,5015.7,,,


## Data Exploration

Let's examine the dataset structure and check for missing values.

In [33]:
# Display basic info and summary statistics
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2552 entries, 0 to 2551
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  2552 non-null   int64         
 1   date        2552 non-null   datetime64[ns]
 2   open        2552 non-null   float64       
 3   high        2552 non-null   float64       
 4   low         2552 non-null   float64       
 5   close       2552 non-null   float64       
 6   volume      2552 non-null   int64         
 7   dxy_close   2300 non-null   float64       
 8   oil_close   2300 non-null   float64       
 9   fedfunds    2552 non-null   float64       
 10  gs10        2552 non-null   float64       
 11  m2real      2552 non-null   float64       
 12  cpi         2300 non-null   float64       
 13  nfp         2300 non-null   float64       
 14  ppi         2300 non-null   float64       
dtypes: datetime64[ns](1), float64(12), int64(2)
memory usage: 299.2 KB
None


In [None]:
# Drop innecessary columns

df = df.drop(columns=["Unnamed: 0"], index=1)
df.head()

Unnamed: 0,date,open,high,low,close,volume,dxy_close,oil_close,fedfunds,gs10,m2real,cpi,nfp,ppi
0,2015-01-02,1184.0,1194.5,1169.5,1186.0,138,,,0.11,1.88,5015.7,,,
2,2015-01-06,1203.5,1220.0,1203.5,1219.300049,97,,,0.11,1.88,5015.7,,,
3,2015-01-07,1219.199951,1219.199951,1210.599976,1210.599976,29,,,0.11,1.88,5015.7,,,
4,2015-01-08,1207.0,1215.699951,1206.300049,1208.400024,92,,,0.11,1.88,5015.7,,,
5,2015-01-09,1211.099976,1220.199951,1210.300049,1216.0,43,,,0.11,1.88,5015.7,,,


## Feature Engineering

### Adding technical indicators

In [35]:
# Calculate 21-period EMA and 200-period EMA on the 'close' price
df["ema_21"] = df["close"].ewm(span=21, adjust=False).mean()
df["ema_200"] = df["close"].ewm(span=200, adjust=False).mean()

# Calculate 14-period RSI
delta = df["close"].diff()
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.rolling(window=14, min_periods=14).mean()
avg_loss = loss.rolling(window=14, min_periods=14).mean()
rs = avg_gain / avg_loss
df["rsi_14"] = 100 - (100 / (1 + rs))

# Display the new columns
df[["close", "ema_21", "ema_200", "rsi_14"]].tail()

Unnamed: 0,close,ema_21,ema_200,rsi_14
2547,2937.600098,2862.293305,2573.843503,72.283579
2548,2947.899902,2870.075723,2577.565458,71.142426
2549,2904.5,2873.205203,2580.818538,58.719342
2550,2916.800049,2877.168371,2584.161637,57.858132
2551,2883.199951,2877.716696,2587.137143,54.450256


In [36]:
# Quickly drop null values
df = df.dropna()

## We'll create a target variable representing the difference between the open price and the high or low price whichever is greater

In [37]:
# Define a function to compute the target for each row
def compute_target(row):
    diff_high = row["high"] - row["open"]
    diff_low = row["low"] - row["open"]
    return diff_high if abs(diff_high) >= abs(diff_low) else diff_low

# Apply the function to create the target column
# (Assumes your consolidated data includes columns "Open", "High", "Low")
df["target"] = df.apply(compute_target, axis=1)

df.head()

Unnamed: 0,date,open,high,low,close,volume,dxy_close,oil_close,fedfunds,gs10,m2real,cpi,nfp,ppi,ema_21,ema_200,rsi_14,target
252,2016-01-04,1063.400024,1082.5,1063.199951,1075.099976,143,98.870003,37.220001,0.34,2.09,5255.9,236.916,4.8,109.7,1072.147921,1131.009838,49.414808,19.099976
253,2016-01-05,1075.599976,1081.5,1075.300049,1078.400024,82,99.400002,36.419998,0.34,2.09,5255.9,236.916,4.8,109.7,1072.716294,1130.486357,54.727431,5.900024
254,2016-01-06,1081.599976,1093.699951,1081.599976,1091.900024,52,99.18,34.23,0.34,2.09,5255.9,236.916,4.8,109.7,1074.460269,1130.102414,59.259268,12.099976
255,2016-01-07,1091.599976,1109.400024,1091.599976,1107.699951,122,98.220001,33.75,0.34,2.09,5255.9,236.916,4.8,109.7,1077.482059,1129.879504,59.440556,17.800049
256,2016-01-08,1111.099976,1111.099976,1093.0,1097.800049,98,98.540001,33.549999,0.34,2.09,5255.9,236.916,4.8,109.7,1079.329149,1129.560305,66.785744,-18.099976


## Scaling Features

We will scale the numerical features (excluding the date and target) using StandardScaler.

In [38]:
from sklearn.preprocessing import StandardScaler

# Identify feature columns (excluding 'date' and 'target')
features = df.columns.difference(["date", "target"])

scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[features] = scaler.fit_transform(df[features])

df_scaled.head()

Unnamed: 0,date,open,high,low,close,volume,dxy_close,oil_close,fedfunds,gs10,m2real,cpi,nfp,ppi,ema_21,ema_200,rsi_14,target
252,2016-01-04,-1.492593,-1.455997,-1.483192,-1.463562,-0.175159,0.08388,-1.655352,-0.882534,-0.413267,-1.447047,-1.309252,0.104853,-1.252882,-1.478729,-1.352997,-0.220203,19.099976
253,2016-01-05,-1.462683,-1.458433,-1.453351,-1.455477,-0.177236,0.185721,-1.698365,-0.882534,-0.413267,-1.447047,-1.309252,0.104853,-1.252882,-1.477311,-1.354477,0.087538,5.900024
254,2016-01-06,-1.447974,-1.428712,-1.437814,-1.422404,-0.178257,0.143447,-1.816113,-0.882534,-0.413267,-1.447047,-1.309252,0.104853,-1.252882,-1.472958,-1.355563,0.350051,12.099976
255,2016-01-07,-1.423458,-1.390464,-1.413153,-1.383697,-0.175874,-0.041019,-1.841921,-0.882534,-0.413267,-1.447047,-1.309252,0.104853,-1.252882,-1.465417,-1.356193,0.360552,17.800049
256,2016-01-08,-1.375652,-1.386323,-1.4097,-1.40795,-0.176691,0.020469,-1.852674,-0.882534,-0.413267,-1.447047,-1.309252,0.104853,-1.252882,-1.460807,-1.357096,0.786032,-18.099976


## Save Cleaned Data

Save the cleaned and scaled data to a new CSV file for future modeling.

In [39]:
# Save the cleaned and scaled data to a CSV file
df_scaled.to_csv("../data/processed/clean_dataset.csv", index=False)
print("Cleaned data saved to data/processed/clean_dataset.csv")

Cleaned data saved to data/processed/clean_dataset.csv
