In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [87]:
# import data from the data sources.
folder_path = "assignment-datasets/"
df = pd.read_csv(folder_path + "Stock_File_1.csv")
df2 = pd.read_csv(folder_path + "Stock_File_2.txt")

In [88]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1-Jun-06,471.6,474.0,442.0,444.42,21900
1,12-Jun-06,454.0,464.0,440.0,446.17,8400
2,22-Jun-06,451.16,464.2,447.6,460.26,19400
3,3-Jul-06,495.1,509.68,493.0,498.97,9100
4,13-Jul-06,518.0,526.4,517.0,521.66,6800


In [89]:
df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,3-Jan-11,798.0,824.95,796.0,810.55,17600
1,13-Jan-11,748.1,768.0,,752.85,13000
2,24-Jan-11,741.0,626.01,732.55,,8700
3,1-Feb-11,753.0,753.0,712.1,717.25,23700
4,11-Feb-11,647.9,654.55,626.35,642.4,12800


In [90]:
# check for null values in both the data frames.
null_count_1 = df.isnull().sum()
null_count_2 = df2.isnull().sum()
print(
    f"""
    {null_count_1}
    {null_count_2}
    """
)


    Date      0
Open      4
High      7
Low       6
Close     7
Volume    0
dtype: int64
    Date      0
Open      3
High      9
Low       7
Close     7
Volume    0
dtype: int64
    


In [91]:
# concat the two data frames.
df_concat = pd.concat([df, df2], axis=0)
df_concat.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1-Jun-06,471.6,474.0,442.0,444.42,21900
1,12-Jun-06,454.0,464.0,440.0,446.17,8400
2,22-Jun-06,451.16,464.2,447.6,460.26,19400
3,3-Jul-06,495.1,509.68,493.0,498.97,9100
4,13-Jul-06,518.0,526.4,517.0,521.66,6800


In [92]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 0 to 203
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    369 non-null    object 
 1   Open    362 non-null    float64
 2   High    353 non-null    float64
 3   Low     356 non-null    float64
 4   Close   355 non-null    float64
 5   Volume  369 non-null    object 
dtypes: float64(4), object(2)
memory usage: 20.2+ KB


In [93]:
# Fill the null values in the high Column with the mean of the High Column.
df_concat["High"].fillna(df_concat["High"].mean(), inplace=True)
# Fill the null values in the Low Column with the mean of the Low Column.
df_concat["Low"].fillna(df_concat["Low"].mean(), inplace=True)
# Fill the null values in the Close Column with the mean of the Close Column.
df_concat["Close"].fillna(df_concat["Close"].mean(), inplace=True)
# Fill the null values in the Open Column with the mean of the Open Column.
df_concat["Open"].fillna(df_concat["Open"].mean(), inplace=True)
# check if null values exists.
df_concat.isnull().sum()

Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [94]:
df_concat[df_concat["Volume"] == "zero"]

Unnamed: 0,Date,Open,High,Low,Close,Volume
103,10-Apr-09,458.75,458.75,458.75,458.75,zero
105,1-May-09,1447.35,487.35,487.35,487.35,zero
119,21-Sep-09,1755.8,795.8,795.8,795.8,zero
123,2-Nov-09,1727.85,767.85,767.85,767.85,zero
129,1-Jan-10,767.25,767.25,767.25,767.25,zero
48,1-May-12,1778.0,818.0,818.0,818.0,zero
84,1-May-13,519.55,519.55,519.55,519.55,zero
120,1-May-14,1800.45,840.45,840.45,840.45,zero
156,1-May-15,2291.95,1331.95,1331.95,1331.95,zero
173,22-Oct-15,1237.75,1237.75,1237.75,1237.75,zero


In [95]:
# Clean Values in Volume Column.

# Find the rows in Volume column where the value is "zero"
df_concat['Volume'].replace('zero', 0, inplace=True)

In [96]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 0 to 203
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    369 non-null    object 
 1   Open    369 non-null    float64
 2   High    369 non-null    float64
 3   Low     369 non-null    float64
 4   Close   369 non-null    float64
 5   Volume  369 non-null    object 
dtypes: float64(4), object(2)
memory usage: 20.2+ KB


In [97]:
# Convert the dtype of Volume column from object to Float
df_concat["Volume"] = df_concat["Volume"].astype(float)
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 0 to 203
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    369 non-null    object 
 1   Open    369 non-null    float64
 2   High    369 non-null    float64
 3   Low     369 non-null    float64
 4   Close   369 non-null    float64
 5   Volume  369 non-null    float64
dtypes: float64(5), object(1)
memory usage: 20.2+ KB


In [98]:
# check for rows where the High value is greater than the Low value.
print(len(df_concat[df_concat["Low"] > df_concat["High"]]))
df_concat[df_concat["Low"] > df_concat["High"]]


35


Unnamed: 0,Date,Open,High,Low,Close,Volume
12,3-Oct-06,591.0,592.0,856.357472,585.96,6200.0
14,23-Oct-06,674.14,667.0,674.14,704.45,65700.0
24,1-Feb-07,729.8,551.02,722.8,760.55,49700.0
32,23-Apr-07,780.0,642.0,772.0,781.63,35400.0
45,3-Sep-07,1131.0,493.0,1131.0,1172.4,165800.0
52,12-Nov-07,1575.0,644.0,1500.0,1548.0,35500.0
53,22-Nov-07,1550.0,709.6,1453.0,1525.0,30500.0
54,3-Dec-07,1580.0,387.6,1558.0,1602.95,135000.0
73,12-Jun-08,980.0,858.892918,945.0,963.3,43000.0
86,21-Oct-08,651.0,678.0,856.357472,673.05,51700.0


In [99]:
# remove all the rows where the High value is greater than the Low value in df_concat
df_concat.drop(
    df_concat[df_concat["Low"] > df_concat["High"]].index, inplace=True
)
df_concat[df_concat["Low"] > df_concat["High"]]


Unnamed: 0,Date,Open,High,Low,Close,Volume


In [100]:
df_concat.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,304.0,304.0,304.0,304.0,304.0
mean,882.283429,883.440086,845.482236,859.99397,38523.684211
std,286.703267,273.894038,267.208456,266.346216,67786.172482
min,364.0,366.0,354.0,363.6,0.0
25%,688.75,695.95,675.0375,684.325,6800.0
50%,814.925,824.975,789.5,811.85,16250.0
75%,1101.2875,1114.0,1050.0,1064.9625,46300.0
max,1800.45,1650.0,1600.05,1607.85,843600.0


In [101]:
# export df_concat
df_concat.to_csv(folder_path + "Stock_File_concat_cleaned.csv", index=False)
