# Monthly Time-Series Data for Commodity Prices

### Munish Kumar

This data set contains commodity prices of agricultural raw products. I want to perform some time series analysis and see if I can forecast future prices.

#### Import Libraries

In [1]:
# General Libraries
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import NullFormatter
import time
import re
import requests
import pickle
import seaborn as sns
import os
import glob
import sys
sns.set()

# Sklearn Libraries
from sklearn import preprocessing

import datetime
from datetime import timedelta, date 
# Wall-clock timestamp taken right after the imports; the final cell subtracts
# it from the current time to report how long the notebook took to run.
start = time.time()
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Forces the print statement to show everything and not truncate (currently disabled)
# np.set_printoptions(threshold=sys.maxsize) 
print('Libraries imported')

Libraries imported


Declare some global variables

In [2]:
# Folder that receives the checkpoint CSV files. The original location is kept
# as the default so existing behaviour is unchanged, but it can be overridden
# with the TIME_SERIES_COMMODITIES_DIR environment variable so the notebook is
# not tied to one machine's directory layout.
dir_name = os.environ.get(
    'TIME_SERIES_COMMODITIES_DIR',
    'C:/Users/quant/Anaconda3/@Projects/Time_Series_Commodities',
)
# File extension (without the leading dot) used for every checkpoint file
filename_suffix = 'csv'

Read in the data file

In [3]:
# Load the raw table; thousands=',' tells pandas to treat commas inside numbers
# as thousands separators when parsing.
df = pd.read_csv('datasets_677484_1190624_agricultural_raw_material.csv', thousands=',')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Month                         361 non-null    object 
 1   Coarse wool Price             327 non-null    float64
 2   Coarse wool price % Change    327 non-null    object 
 3   Copra Price                   339 non-null    float64
 4   Copra price % Change          339 non-null    object 
 5   Cotton Price                  361 non-null    float64
 6   Cotton price % Change         361 non-null    object 
 7   Fine wool Price               327 non-null    float64
 8   Fine wool price % Change      327 non-null    object 
 9   Hard log Price                361 non-null    float64
 10  Hard log price % Change       361 non-null    object 
 11  Hard sawnwood Price           327 non-null    float64
 12  Hard sawnwood price % Change  327 non-null    object 
 13  Hide 

#### Dealing with NaN values

In [4]:
# Report how much of the raw table is populated and how much is missing.
# Percentages are computed against the full number of rows / cells (present
# plus missing); dividing by the non-null count alone would overstate them.
missing_per_column = df.isnull().sum()

# Total number of non-null values in the whole table
print ('Total Data Points:\n', df.count().sum())
print('\n')

# Number of non-null values in each column
print ('Total Data Points per column:\n', df.count())
print('\n')

# Any missing values?
print ('Missing Values?', df.isnull().values.any())
print('\n')

# Total number of missing values
print ('Total Count of Missing values:', missing_per_column.sum())
print('\n')

# Total missing values for each feature
print ('Number of Points Missing Per Column:')
print (missing_per_column)
print('\n')

# % of missing values for each feature, rounded to 3 decimals
# (the precision is passed to the rounding call itself, not to print)
print ('Percentage of Points Missing Per Column:')
print ((missing_per_column / len(df) * 100).round(3))
print('\n')

# % missing values altogether, relative to the total number of cells
print ('Percentage of Points Missing in total:',
       round(missing_per_column.sum() / df.size * 100))
print('\n')

Total Data Points:
 8571


Total Data Points per column:
 Month                           361
Coarse wool Price               327
Coarse wool price % Change      327
Copra Price                     339
Copra price % Change            339
Cotton Price                    361
Cotton price % Change           361
Fine wool Price                 327
Fine wool price % Change        327
Hard log Price                  361
Hard log price % Change         361
Hard sawnwood Price             327
Hard sawnwood price % Change    327
Hide Price                      327
Hide price % change             327
Plywood Price                   361
Plywood price % Change          361
Rubber Price                    361
Rubber price % Change           361
Softlog Price                   327
Softlog price % Change          327
Soft sawnwood Price             327
Soft sawnwood price % Change    327
Wood pulp Price                 360
Wood pulp price % Change        360
dtype: int64


Missing Values? True


Tota

It looks like for some columns, 10% of the data is missing. However, overall, only 5% of the data is NaN.

In [5]:
# Collect every row in which at least one column is NaN, for inspection
has_missing = df.isnull().any(axis='columns')
null_data = df.loc[has_missing]
null_data

Unnamed: 0,Month,Coarse wool Price,Coarse wool price % Change,Copra Price,Copra price % Change,Cotton Price,Cotton price % Change,Fine wool Price,Fine wool price % Change,Hard log Price,...,Plywood Price,Plywood price % Change,Rubber Price,Rubber price % Change,Softlog Price,Softlog price % Change,Soft sawnwood Price,Soft sawnwood price % Change,Wood pulp Price,Wood pulp price % Change
327,Jul-17,,,1059.0,-5.36%,1.85,-1.07%,,,264.62,...,485.38,-1.40%,1.75,1.74%,,,,,875.0,0.00%
328,Aug-17,,,1062.0,0.28%,1.75,-5.41%,,,270.96,...,497.0,2.39%,1.84,5.14%,,,,,875.0,0.00%
329,Sep-17,,,1015.0,-4.43%,1.78,1.71%,,,268.93,...,493.27,-0.75%,1.86,1.09%,,,,,875.0,0.00%
330,Oct-17,,,989.0,-2.56%,1.73,-2.81%,,,263.62,...,483.53,-1.97%,1.64,-11.83%,,,,,875.0,0.00%
331,Nov-17,,,1038.0,4.95%,1.77,2.31%,,,263.85,...,483.96,0.09%,1.57,-4.27%,,,,,875.0,0.00%
332,Dec-17,,,958.0,-7.71%,1.88,6.21%,,,263.62,...,483.53,-0.09%,1.65,5.10%,,,,,875.0,0.00%
333,Jan-18,,,942.5,-1.62%,2.01,6.91%,,,268.3,...,492.12,1.78%,1.72,4.24%,,,,,875.0,0.00%
334,Feb-18,,,835.0,-11.41%,1.95,-2.99%,,,275.8,...,505.88,2.80%,1.72,0.00%,,,,,875.0,0.00%
335,Mar-18,,,745.0,-10.78%,2.03,4.10%,,,280.72,...,514.9,1.78%,1.76,2.33%,,,,,875.0,0.00%
336,Apr-18,,,756.0,1.48%,2.03,0.00%,,,276.7,...,507.53,-1.43%,1.73,-1.70%,,,,,875.0,0.00%


A final check shows that all the missing data is located in the last rows of the table. I will drop these rows and move on with the analysis.

In [6]:
# Discard every row that contains at least one NaN. The explicit .copy() makes
# df_preproc an independent frame, so the column assignments made on it in later
# cells do not raise SettingWithCopyWarning.
df_preproc = df.dropna(axis = 0).copy()
# Confirm that no missing value is left
print ('Missing Values?', df_preproc.isnull().values.any())

Missing Values? False


In [7]:
# List the column names of the cleaned frame as a NumPy array
df_preproc.columns.to_numpy()

array(['Month', 'Coarse wool Price', 'Coarse wool price % Change',
       'Copra Price', 'Copra price % Change', 'Cotton Price',
       'Cotton price % Change', 'Fine wool Price',
       'Fine wool price % Change', 'Hard log Price',
       'Hard log price % Change', 'Hard sawnwood Price',
       'Hard sawnwood price % Change', 'Hide Price',
       'Hide price % change', 'Plywood Price', 'Plywood price % Change',
       'Rubber Price', 'Rubber price % Change', 'Softlog Price',
       'Softlog price % Change', 'Soft sawnwood Price',
       'Soft sawnwood price % Change', 'Wood pulp Price',
       'Wood pulp price % Change'], dtype=object)

In [8]:
# Drop all "% Change" columns; the changes are recomputed from the prices.
# (For some reason, manipulation of the object-dtype originals was problematic)
#
# The columns are selected by name pattern from the frame as it currently is,
# instead of from a hand-typed list, so re-running this cell once they are
# already gone is a no-op rather than a KeyError. The comparison is
# case-insensitive because the raw header mixes "% Change" and "% change".
cols = [name for name in df_preproc.columns
        if str(name).lower().endswith('% change')]

df_preproc = df_preproc.drop(columns=cols)
df_preproc

Unnamed: 0,Month,Coarse wool Price,Copra Price,Cotton Price,Fine wool Price,Hard log Price,Hard sawnwood Price,Hide Price,Plywood Price,Rubber Price,Softlog Price,Soft sawnwood Price,Wood pulp Price
0,Apr-90,482.34,236.00,1.83,1071.63,161.20,549.91,100.00,312.36,0.84,120.66,218.76,829.29
1,May-90,447.26,234.00,1.89,1057.18,172.86,491.88,99.46,350.12,0.85,124.28,213.00,842.51
2,Jun-90,440.99,216.00,1.99,898.24,181.67,495.39,97.90,373.94,0.85,129.45,200.00,831.35
3,Jul-90,418.44,205.00,2.01,895.83,187.96,485.86,96.75,378.48,0.86,124.23,210.05,798.83
4,Aug-90,418.44,198.00,1.79,951.22,186.13,487.52,91.89,364.60,0.88,129.70,208.30,818.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,Feb-17,1029.58,1146.25,1.88,1368.14,263.45,680.49,76.58,483.23,2.71,157.58,287.43,875.00
323,Mar-17,1059.60,1016.00,1.91,1454.83,263.48,672.48,77.93,483.27,2.35,160.05,300.42,875.00
324,Apr-17,991.12,1044.00,1.92,1404.98,270.34,688.44,75.43,495.87,2.21,159.84,306.60,875.00
325,May-17,1019.95,1112.50,1.95,1433.47,265.28,704.52,69.36,486.59,2.10,159.84,306.60,875.00


#### Check the Month column, convert if necessary to time-series

In [9]:
# Summary of the Month column while it is still stored as text
df_preproc.loc[:, 'Month'].describe()

count        327
unique       327
top       Oct-08
freq           1
Name: Month, dtype: object

In [10]:
# The raw labels look like 'Apr-90' (abbreviated month name, two-digit year),
# so each one is parsed with an explicit format and rewritten as a 'YYYY-MM'
# string, e.g. 'Apr-90' -> '1990-04'.
#   %b : abbreviated month name     %y : two-digit year
#   %Y : four-digit year            %m : zero-padded month number
def _to_year_month(label):
    """Parse a 'Mon-YY' label with strptime and return it as a 'YYYY-MM' string."""
    return datetime.datetime.strptime(label, '%b-%y').strftime('%Y-%m')

dfa = df_preproc['Month'].tolist()
df_preproc['Month'] = list(map(_to_year_month, dfa))
df_preproc

Unnamed: 0,Month,Coarse wool Price,Copra Price,Cotton Price,Fine wool Price,Hard log Price,Hard sawnwood Price,Hide Price,Plywood Price,Rubber Price,Softlog Price,Soft sawnwood Price,Wood pulp Price
0,1990-04,482.34,236.00,1.83,1071.63,161.20,549.91,100.00,312.36,0.84,120.66,218.76,829.29
1,1990-05,447.26,234.00,1.89,1057.18,172.86,491.88,99.46,350.12,0.85,124.28,213.00,842.51
2,1990-06,440.99,216.00,1.99,898.24,181.67,495.39,97.90,373.94,0.85,129.45,200.00,831.35
3,1990-07,418.44,205.00,2.01,895.83,187.96,485.86,96.75,378.48,0.86,124.23,210.05,798.83
4,1990-08,418.44,198.00,1.79,951.22,186.13,487.52,91.89,364.60,0.88,129.70,208.30,818.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,2017-02,1029.58,1146.25,1.88,1368.14,263.45,680.49,76.58,483.23,2.71,157.58,287.43,875.00
323,2017-03,1059.60,1016.00,1.91,1454.83,263.48,672.48,77.93,483.27,2.35,160.05,300.42,875.00
324,2017-04,991.12,1044.00,1.92,1404.98,270.34,688.44,75.43,495.87,2.21,159.84,306.60,875.00
325,2017-05,1019.95,1112.50,1.95,1433.47,265.28,704.52,69.36,486.59,2.10,159.84,306.60,875.00


In [11]:
# Turn the 'YYYY-MM' text labels into real pandas timestamps
month_as_dates = pd.to_datetime(df_preproc['Month'])
df_preproc['Month'] = month_as_dates
df_preproc.head()

Unnamed: 0,Month,Coarse wool Price,Copra Price,Cotton Price,Fine wool Price,Hard log Price,Hard sawnwood Price,Hide Price,Plywood Price,Rubber Price,Softlog Price,Soft sawnwood Price,Wood pulp Price
0,1990-04-01,482.34,236.0,1.83,1071.63,161.2,549.91,100.0,312.36,0.84,120.66,218.76,829.29
1,1990-05-01,447.26,234.0,1.89,1057.18,172.86,491.88,99.46,350.12,0.85,124.28,213.0,842.51
2,1990-06-01,440.99,216.0,1.99,898.24,181.67,495.39,97.9,373.94,0.85,129.45,200.0,831.35
3,1990-07-01,418.44,205.0,2.01,895.83,187.96,485.86,96.75,378.48,0.86,124.23,210.05,798.83
4,1990-08-01,418.44,198.0,1.79,951.22,186.13,487.52,91.89,364.6,0.88,129.7,208.3,818.74


In [12]:
# Summary of the Month column after the conversion to timestamps
df_preproc.loc[:, 'Month'].describe()

count                     327
unique                    327
top       2009-09-01 00:00:00
freq                        1
first     1990-04-01 00:00:00
last      2017-06-01 00:00:00
Name: Month, dtype: object

In [13]:
# Promote the Month column to be the row index of the frame
df_preproc = df_preproc.set_index(keys="Month")

#### Setting the desired frequency

I am setting the frequency of the data set to monthly, with each observation stamped on the first day of its month (the `'MS'`, month-start, frequency).

In [14]:
# Conform the index to a month-start ('MS') frequency, then summarise each column
month_start = 'MS'
df_preproc = df_preproc.asfreq(freq=month_start)
df_preproc.describe(include='all')

Unnamed: 0,Coarse wool Price,Copra Price,Cotton Price,Fine wool Price,Hard log Price,Hard sawnwood Price,Hide Price,Plywood Price,Rubber Price,Softlog Price,Soft sawnwood Price,Wood pulp Price
count,327.0,327.0,327.0,327.0,327.0,327.0,327.0,327.0,327.0,327.0,327.0,327.0
mean,626.333731,529.148532,1.621376,850.119572,248.984343,707.950367,78.566667,509.422691,1.660765,164.527462,291.061713,678.674373
std,299.638838,264.097498,0.532764,285.075196,68.621751,144.563241,13.690623,93.685312,1.067676,25.596308,34.113959,158.292658
min,247.09,182.0,0.82,417.47,133.28,413.37,28.59,312.36,0.49,119.35,183.61,384.0
25%,369.62,368.0,1.275,646.345,194.695,573.47,69.495,434.055,0.84,145.97,277.59,544.705
50%,525.07,449.0,1.54,748.18,247.32,728.71,77.25,512.34,1.33,160.37,294.96,662.54
75%,847.12,656.75,1.83,1019.87,286.97,831.635,86.0,581.69,2.155,180.21,310.865,832.17
max,1391.47,1503.0,5.06,1865.44,520.81,973.6,114.63,751.81,6.26,259.97,372.6,966.49


After all that manipulation, we see that no new data has been created. The main thing is that we are now sure the data is monthly, and that the month column is the index, in the right date-time format.

In [15]:
# Print the index type, frequency, column dtypes and non-null counts of the
# prepared frame as a final structural check.
df_preproc.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 327 entries, 1990-04-01 to 2017-06-01
Freq: MS
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Coarse wool Price    327 non-null    float64
 1   Copra Price          327 non-null    float64
 2   Cotton Price         327 non-null    float64
 3   Fine wool Price      327 non-null    float64
 4   Hard log Price       327 non-null    float64
 5   Hard sawnwood Price  327 non-null    float64
 6   Hide Price           327 non-null    float64
 7   Plywood Price        327 non-null    float64
 8   Rubber Price         327 non-null    float64
 9   Softlog Price        327 non-null    float64
 10  Soft sawnwood Price  327 non-null    float64
 11  Wood pulp Price      327 non-null    float64
dtypes: float64(12)
memory usage: 33.2 KB


I now create a second data frame that recomputes the month-over-month changes I dropped earlier (expressed as fractions, e.g. 0.05 for a 5% rise). If needed, this second data frame will be available for use in the machine learning algorithms I will discuss. For now, it is written out as a checkpoint file.

In [16]:
# Month-over-month relative change of every price column, as a fraction
# (0.05 means +5%). The first row has no predecessor and is therefore NaN.
# Gaps are forward-filled explicitly and fill_method=None is passed, because
# pct_change's own fill_method values are deprecated in recent pandas releases.
df_prcnt = df_preproc.ffill().pct_change(fill_method=None)
df_prcnt

Unnamed: 0_level_0,Coarse wool Price,Copra Price,Cotton Price,Fine wool Price,Hard log Price,Hard sawnwood Price,Hide Price,Plywood Price,Rubber Price,Softlog Price,Soft sawnwood Price,Wood pulp Price
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1990-04-01,,,,,,,,,,,,
1990-05-01,-0.072729,-0.008475,0.032787,-0.013484,0.072333,-0.105526,-0.005400,0.120886,0.011905,0.030002,-0.026330,0.015941
1990-06-01,-0.014019,-0.076923,0.052910,-0.150343,0.050966,0.007136,-0.015685,0.068034,0.000000,0.041600,-0.061033,-0.013246
1990-07-01,-0.051135,-0.050926,0.010050,-0.002683,0.034623,-0.019237,-0.011747,0.012141,0.011765,-0.040324,0.050250,-0.039117
1990-08-01,0.000000,-0.034146,-0.109453,0.061831,-0.009736,0.003417,-0.050233,-0.036673,0.023256,0.044031,-0.008331,0.024924
...,...,...,...,...,...,...,...,...,...,...,...,...
2017-02-01,0.001810,-0.064286,0.032967,0.060598,0.018755,0.012905,0.020386,0.018763,0.058594,-0.073930,-0.077331,0.000000
2017-03-01,0.029158,-0.113631,0.015957,0.063363,0.000114,-0.011771,0.017629,0.000083,-0.132841,0.015675,0.045194,0.000000
2017-04-01,-0.064628,0.027559,0.005236,-0.034265,0.026036,0.023733,-0.032080,0.026072,-0.059574,-0.001312,0.020571,0.000000
2017-05-01,0.029088,0.065613,0.015625,0.020278,-0.018717,0.023357,-0.080472,-0.018715,-0.049774,0.000000,0.000000,0.000000


#### Checkpoint File

In [17]:
# Checkpoint 1: write the cleaned price table (Month index included) to CSV
base_filename = 'Clean_Data_raw'
checkpoint_name = "{}.{}".format(base_filename, filename_suffix)
csvs_sht = os.path.join(dir_name, checkpoint_name)
df_preproc.to_csv(csvs_sht, header=True, index=True)
print ("Final File Extract Produced:", checkpoint_name)

Final File Extract Produced: Clean_Data_raw.csv


In [18]:
# Checkpoint 2: write the month-over-month change table (Month index included) to CSV
base_filename = 'Clean_Data_percent'
checkpoint_name = "{}.{}".format(base_filename, filename_suffix)
csvs_sht = os.path.join(dir_name, checkpoint_name)
df_prcnt.to_csv(csvs_sht, header=True, index=True)
print ("Final File Extract Produced:", checkpoint_name)

Final File Extract Produced: Clean_Data_percent.csv


In [19]:
#plt.figure(figsize=(20, 5))
#groups = df.groupby(Grouper(freq = 'A'))
#years = dataFrame()
#for name, group in groups:
#    year[name.year]= groups.values
#years.plot(subplots=True, legend = False)
#
#plt.show()

## Conclusion

In [20]:
# Report the wall-clock time that has passed since `start` was recorded
count = 'Completed Process'
elapsed = time.time() - start
print ("{} in {} seconds".format(count, elapsed))

Completed Process in 0.32114243507385254 seconds
