# Generate a Single Combined CSV

In [5]:
import pandas as pd
import os

# Same path as output_file_path below. The visible cells never read this
# module-level name: the loader function's parameter of the same name shadows it.
file_path = 'data/sp500/SP500.csv'
# Directory scanned for the per-ticker '*.csv' input files.
folder_path = "data/sp500/csv/"
# Destination of the combined, sorted CSV; the "Read Data" cells re-read this file.
output_file_path = 'data/sp500/SP500.csv'

def add_ticker_and_load_csv(file_path):
    """Load one per-ticker CSV and tag every row with its ticker symbol.

    The ticker is the file name with only its final extension removed, so
    symbols that themselves contain a dot are preserved
    (``BRK.B.csv`` -> ``BRK.B``) instead of being cut at the first dot.

    Parameters
    ----------
    file_path : str
        Path to a CSV file containing a ``Date`` column formatted as
        ``%d-%m-%Y``.

    Returns
    -------
    pandas.DataFrame
        The file's contents with ``Date`` converted to datetime and a
        ``Ticker`` column inserted as the first column.

    Raises
    ------
    KeyError
        If the file has no ``Date`` column.
    ValueError
        If a ``Date`` value does not match ``%d-%m-%Y``.
    """
    # splitext strips only the trailing ".csv"; split('.')[0] would also drop
    # everything after the first dot inside the symbol itself.
    ticker = os.path.splitext(os.path.basename(file_path))[0]
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
    df.insert(0, "Ticker", ticker)
    return df


# Collect the path of every '.csv' entry found directly inside folder_path.
csv_names = filter(lambda name: name.endswith('.csv'), os.listdir(folder_path))
csv_files = [os.path.join(folder_path, name) for name in csv_names]

# Load each file with its ticker attached, then stack them into one frame
# with a fresh 0..n-1 index.
ticker_frames = [add_ticker_and_load_csv(path) for path in csv_files]
combined_df = pd.concat(ticker_frames, ignore_index=True)

# Order rows by ticker, then chronologically within each ticker.
sorted_df = combined_df.sort_values(by=['Ticker', 'Date'])

# Persist the combined data without the positional index column.
sorted_df.to_csv(output_file_path, index=False)

# Read Data

In [10]:
# Reload the combined CSV written by the previous section. No parse_dates is
# passed, so the Date column comes back as strings (object dtype in info()).
df = pd.read_csv(output_file_path)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3266000 entries, 0 to 3265999
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Ticker          object 
 1   Date            object 
 2   Low             float64
 3   Open            float64
 4   Volume          float64
 5   High            float64
 6   Close           float64
 7   Adjusted Close  float64
dtypes: float64(6), object(2)
memory usage: 199.3+ MB


In [11]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a negative minimum for Adjusted Close
# and maxima above 500,000 — confirm whether these are data errors.
df.describe()

Unnamed: 0,Low,Open,Volume,High,Close,Adjusted Close
count,3258423.0,3258423.0,3258423.0,3258423.0,3258423.0,3258423.0
mean,382.1284,384.936,5064393.0,388.1214,385.1922,378.3433
std,8803.883,8874.08,30723090.0,8933.282,8868.704,8868.953
min,1e-05,0.0,0.0,1e-05,1e-05,-279.9133
25%,7.9,7.68,365800.0,8.125,8.0,4.141712
50%,23.45833,23.69754,1232220.0,24.05688,23.77,15.95139
75%,51.96863,52.43404,3357400.0,53.17,52.58,42.32024
max,533345.0,544389.0,7421641000.0,544389.0,539180.0,539180.0


In [19]:
print("Number of null values per column:")
# Non-null tally per column, as a one-column frame named 'count'.
counts = df.count().to_frame(name='count')
# Nulls are whatever remains once non-null values are subtracted from the row total.
counts['nulls'] = len(df) - counts['count']
counts

Number of null values per column:


Unnamed: 0,count,nulls
Ticker,3266000,0
Date,3266000,0
Low,3258423,7577
Open,3258423,7577
Volume,3258423,7577
High,3258423,7577
Close,3258423,7577
Adjusted Close,3258423,7577


In [26]:
# Check whether any rows share the same (Ticker, Date) pair. The sum counts
# repeats beyond the first occurrence, so 0 means every pair is unique.
df.duplicated(['Ticker', 'Date']).sum()

0

In [29]:
# Count missing values per column for each ticker.
# The null mask is built once and summed per group, rather than calling
# groupby.apply with a Python lambda on every group: apply operating on the
# grouping column is deprecated in recent pandas, and the vectorized form
# produces the same table (index = Ticker, one column per original column).
missing_values_by_ticker = df.isnull().groupby(df['Ticker']).sum()
print(missing_values_by_ticker)

        Ticker  Date  Low  Open  Volume  High  Close  Adjusted Close
Ticker                                                              
A            0     0    0     0       0     0      0               0
AAL          0     0    0     0       0     0      0               0
AAP          0     0    0     0       0     0      0               0
AAPL         0     0    0     0       0     0      0               0
ABBV         0     0    0     0       0     0      0               0
...        ...   ...  ...   ...     ...   ...    ...             ...
XYL          0     0    0     0       0     0      0               0
YUM          0     0    0     0       0     0      0               0
ZBH          0     0    0     0       0     0      0               0
ZION         0     0    0     0       0     0      0               0
ZTS          0     0    0     0       0     0      0               0

[409 rows x 8 columns]
