In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [73]:
# read csv file
data = pd.read_csv("./files/data_clean.csv")
# check data shape
display(data.head())
data.shape

Unnamed: 0,date,year,gold,silver,platinum,coal_monthly,n_gas,oil,euro,gb_pound,dowjones_industrial,dowjones_transport,nasdaq,snp,southern_copper,rio_tinto,royal_gold,newmont,agnico_mines
0,1997-10-01,1997,333.5,5.16,1.46,37.15,3.12,21.05,1.1,1.62,8015.5,3203.6,1102.06,955.41,3.04,16.25,8.75,44.44,9.94
1,1997-10-02,1997,331.2,5.16,1.48,37.15,3.11,21.77,1.11,1.62,8027.5,3227.8,1112.84,960.46,3.02,16.22,8.56,44.88,9.81
2,1997-10-03,1997,334.6,5.28,1.5,37.15,3.12,22.76,1.12,1.62,8038.6,3207.8,1124.44,965.03,3.03,16.16,8.56,44.94,10.06
3,1997-10-06,1997,332.3,5.2,1.45,37.15,2.98,21.93,1.11,1.62,8100.2,3250.0,1125.93,972.69,3.01,16.22,8.25,44.25,9.81
4,1997-10-07,1997,331.2,5.19,1.41,37.15,2.88,21.96,1.11,1.62,8178.3,3281.2,1140.74,983.12,3.04,16.12,8.38,44.25,9.5


(5281, 19)

##### functions

In [74]:
# pick data only from first entry until 5 September 2011 (see STEP 4 for detailed info regarding the decision)
data = data[data["date"] <= "2011-09-05"]

In [75]:
def add_year_month(df,col):
    df["year"] = pd.DatetimeIndex(df[col]).year
    df["month"] = pd.DatetimeIndex(df[col]).month
    return df

In [76]:
def pivot_by_date(df, col):
    dfnew = df.pivot_table(index = ["year","month"], values = col, aggfunc = "mean").reset_index()
    return dfnew

In [77]:
# function to add column that tells if the price goes up from the previous month,
# to get the delta price from previous month, and
# to print the approximate interval when price goes up and when price goes down

def see_trends(df, price_col, timerange):
    # get price prediction as list
    print(price_col+":")
    price = df[price_col].tolist()
    # price_up_trend shows how many day intervals in average does the price go up
    price_up_trend = []
    price_down_trend = []
    # up is a bool whether the price on that day goes up or not
    up = []
    # diff is the difference of price from today compared to yesterday
    diff = []
    price_up = 0
    price_down = 0
    for i in range(len(price)):
        try:
            if i == 0:
                diff.append(0)
                up.append(False)
            else:
                if (i != 0) & (price[i] >= price[i-1]):
                    price_up +=1
                    up.append(True)
                    price_down_trend.append(price_down)
                    price_down = 0
                elif (i != 0) & (price[i] < price[i-1]):
                    price_down += 1
                    up.append(False)
                    price_up_trend.append(price_up)
                    price_up = 0
                diff.append(price[i] - price[i-1])
        except:
            up.append(False)
            diff.append(0)
            
    df["price_goes_up"] = up
    df["delta_price"] = diff
    display(df["price_goes_up"].value_counts())
    # this is the average days interval where price starts to go up again
    price_up_trend_mean = np.mean(price_up_trend)
    price_down_trend_mean = np.mean(price_down_trend)
    print("price goes up every",round(price_up_trend_mean,1), timerange)
    print("price goes down every",round(price_down_trend_mean,1),timerange)
    display(df.head())
    return df

In [78]:
def growth(df,col1, col2):
    df["growth"] = (df[col1]/df[col2])
    return df

##### run functions

In [79]:
# add year and month column
data = add_year_month(data, "date")
# make pivot table for each gold and silver
pivot_gold = pivot_by_date(data, "gold")
pivot_silver = pivot_by_date(data, "silver")

In [80]:
# checking the shape of both table
display(pivot_gold.head())
display(pivot_silver.head())

Unnamed: 0,year,month,gold
0,1997,10,324.095652
1,1997,11,305.6325
2,1997,12,289.128261
3,1998,1,289.37619
4,1998,2,297.4525


Unnamed: 0,year,month,silver
0,1997,10,5.019565
1,1997,11,5.0965
2,1997,12,5.863043
3,1998,1,5.889524
4,1998,2,6.797


In [81]:
# the "timerange" is just helping in the print part. no dramatic actions intended
timerange = "month"
# update dataframe with more columns 
pivot_gold = see_trends(pivot_gold, "gold", timerange)
pivot_silver = see_trends(pivot_silver, "silver", timerange)

gold:


True     97
False    71
Name: price_goes_up, dtype: int64

price goes up every 1.3 month
price goes down every 0.7 month


Unnamed: 0,year,month,gold,price_goes_up,delta_price
0,1997,10,324.095652,False,0.0
1,1997,11,305.6325,False,-18.463152
2,1997,12,289.128261,False,-16.504239
3,1998,1,289.37619,True,0.24793
4,1998,2,297.4525,True,8.07631


silver:


True     94
False    74
Name: price_goes_up, dtype: int64

price goes up every 1.2 month
price goes down every 0.8 month


Unnamed: 0,year,month,silver,price_goes_up,delta_price
0,1997,10,5.019565,False,0.0
1,1997,11,5.0965,True,0.076935
2,1997,12,5.863043,True,0.766543
3,1998,1,5.889524,True,0.02648
4,1998,2,6.797,True,0.907476


In [82]:
pivot_gold = growth(pivot_gold, "delta_price","gold")
pivot_silver = growth(pivot_silver, "delta_price", "silver")

In [83]:
display(pivot_gold.head(5))
display(pivot_silver.head(5))
# delta_price is price of this month minus price of previous month
# growth is delta_price / price times 100. so kind of like percentage

Unnamed: 0,year,month,gold,price_goes_up,delta_price,growth
0,1997,10,324.095652,False,0.0,0.0
1,1997,11,305.6325,False,-18.463152,-0.06041
2,1997,12,289.128261,False,-16.504239,-0.057083
3,1998,1,289.37619,True,0.24793,0.000857
4,1998,2,297.4525,True,8.07631,0.027152


Unnamed: 0,year,month,silver,price_goes_up,delta_price,growth
0,1997,10,5.019565,False,0.0,0.0
1,1997,11,5.0965,True,0.076935,0.015096
2,1997,12,5.863043,True,0.766543,0.130742
3,1998,1,5.889524,True,0.02648,0.004496
4,1998,2,6.797,True,0.907476,0.133511


##### Hypothesis H0: price growth rate of gold is lower than or equal price growth rate of silver

In [84]:
import scipy.stats
confidence_level1 = 0.95
degrees_freedom1 = (pivot_gold.shape[0]) - 1
sample_mean1 = pivot_gold["growth"].mean()
sample_standard_error1 = scipy.stats.sem(pivot_gold["growth"])
confidence_interval1 = scipy.stats.t.interval(confidence_level1, 
                                             degrees_freedom1, 
                                             sample_mean1, 
                                             sample_standard_error1)

In [85]:
print( 'confidence interval:', confidence_interval1, '.' )

confidence interval: (0.003657672482962734, 0.01556174704765334) .


In [86]:
from scipy.stats import ttest_1samp
stat1, pval1 = ttest_1samp(pivot_gold["growth"], pivot_silver.growth.mean())

In [87]:
print('stat\t\t\t\t:', stat1)
print('pvalue for the two-tailed test\t:', pval1/2)

stat				: -0.20713555405492734
pvalue for the two-tailed test	: 0.4180781745458262


we can see here that the p-value exceeds .05 <br>
this means we fail to reject the H0 hypothesis. <br>
Also when we look at the stat, it is showing a negative value, which tells us that <br>
the gold growth rate is actually lower than silver growth rate. <br>
This means that we fail to reject the two statements in the H0 hypothesis, where we said that <br>
the gold growth rate is lower OR same equal the silver growth rate.