In [39]:
import pandas as pd
import datetime

In [42]:
df = pd.read_csv('walmart_stock_prices.csv', parse_dates=['Date'])

# 1. How big is the data?

In [43]:
df.shape

(13233, 8)

# 2. How does the data look like

In [44]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1972-08-25 00:00:00-04:00,0.011463,0.011683,0.011463,0.011639,7526400,0.0,0.0
1,1972-08-28 00:00:00-04:00,0.011639,0.011727,0.011595,0.011595,2918400,0.0,0.0
2,1972-08-29 00:00:00-04:00,0.011551,0.011551,0.011463,0.011463,5836800,0.0,0.0
3,1972-08-30 00:00:00-04:00,0.011463,0.011463,0.011374,0.011463,1228800,0.0,0.0
4,1972-08-31 00:00:00-04:00,0.011374,0.011374,0.011286,0.011286,2611200,0.0,0.0


### Here, sample can be used to take random rows

In [45]:
df.sample(6)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
4398,1990-01-24 00:00:00-05:00,1.052037,1.086893,1.04253,1.080556,20455200,0.0,0.0
5783,1995-07-18 00:00:00-04:00,2.788972,2.828069,2.723809,2.788972,27587400,0.0,0.0
7884,2003-11-18 00:00:00-05:00,12.059,12.122572,11.953777,11.995428,30298500,0.0,0.0
9856,2011-09-19 00:00:00-04:00,13.150017,13.260903,13.114735,13.21806,27086700,0.0,0.0
4828,1991-10-07 00:00:00-04:00,2.3831,2.427823,2.370322,2.376711,11060400,0.0,0.0
3864,1987-12-14 00:00:00-05:00,0.583007,0.62062,0.583007,0.614351,21036000,0.0,0.0


# 3. What is the datatype of cols?

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13233 entries, 0 to 13232
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          13233 non-null  object 
 1   Open          13233 non-null  float64
 2   High          13233 non-null  float64
 3   Low           13233 non-null  float64
 4   Close         13233 non-null  float64
 5   Volume        13233 non-null  int64  
 6   Dividends     13233 non-null  float64
 7   Stock Splits  13233 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 827.2+ KB


# 4. Are there any missing values?

In [47]:
df.isnull().sum()

Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
dtype: int64

If instead of 0 there are other values it means that much spaces are left blank in the particular column

# 5. How does the data look Mathematically

In [48]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
count,13233.0,13233.0,13233.0,13233.0,13233.0,13233.0,13233.0
mean,11.942685,12.047041,11.840625,11.946184,22441840.0,0.000922,0.001587
std,15.85759,15.979783,15.73979,15.865461,18390040.0,0.011606,0.058295
min,0.002839,0.002839,0.002617,0.002839,0.0,0.0,0.0
25%,0.331929,0.335493,0.32917,0.332387,11902200.0,0.0,0.0
50%,7.361626,7.430911,7.237296,7.350794,18756900.0,0.0,0.0
75%,14.864691,15.031126,14.768921,14.910342,28018500.0,0.0,0.0
max,105.300003,105.300003,103.599998,105.050003,395500800.0,0.208,3.0


# 6. Are there any duplicat values?

In [49]:
df.duplicated().sum()

np.int64(0)

To remove the duplicate values use the drop_duplicates() function

In [50]:
df.drop_duplicates() 

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1972-08-25 00:00:00-04:00,0.011463,0.011683,0.011463,0.011639,7526400,0.0,0.0
1,1972-08-28 00:00:00-04:00,0.011639,0.011727,0.011595,0.011595,2918400,0.0,0.0
2,1972-08-29 00:00:00-04:00,0.011551,0.011551,0.011463,0.011463,5836800,0.0,0.0
3,1972-08-30 00:00:00-04:00,0.011463,0.011463,0.011374,0.011463,1228800,0.0,0.0
4,1972-08-31 00:00:00-04:00,0.011374,0.011374,0.011286,0.011286,2611200,0.0,0.0
...,...,...,...,...,...,...,...,...
13228,2025-02-14 00:00:00-05:00,105.300003,105.300003,103.599998,104.040001,14109500,0.0,0.0
13229,2025-02-18 00:00:00-05:00,103.720001,103.989998,102.510002,103.779999,18247300,0.0,0.0
13230,2025-02-19 00:00:00-05:00,103.849998,104.199997,102.550003,104.000000,18508000,0.0,0.0
13231,2025-02-20 00:00:00-05:00,98.779999,100.120003,96.680000,97.209999,55450900,0.0,0.0


# 7. How is the correlation between the cols

In [58]:
df['Date'] = pd.to_datetime(df['Date'], utc=True).map(pd.Timestamp.toordinal)


In [59]:
df.corr()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,1.0,0.82352,0.823705,0.823437,0.82343,0.394639,0.087892,-0.012489
Open,0.82352,1.0,0.999954,0.999955,0.999911,0.143856,0.083874,-0.003507
High,0.823705,0.999954,1.0,0.999936,0.999958,0.145541,0.083625,-0.003396
Low,0.823437,0.999955,0.999936,1.0,0.999956,0.142087,0.083728,-0.003623
Close,0.82343,0.999911,0.999958,0.999956,1.0,0.143606,0.083656,-0.003321
Volume,0.394639,0.143856,0.145541,0.142087,0.143606,1.0,0.026258,-0.000241
Dividends,0.087892,0.083874,0.083625,0.083728,0.083656,0.026258,1.0,-0.002164
Stock Splits,-0.012489,-0.003507,-0.003396,-0.003623,-0.003321,-0.000241,-0.002164,1.0
