In [1]:
import pandas as pd

In [2]:
# Location of the Walmart CSV on the local machine
CSV_PATH = '/home/msjahid/Downloads/walmart.csv'
df = pd.read_csv(CSV_PATH)

In [3]:
# Peek at five randomly chosen rows; no random_state is passed, so the
# selection differs each time the cell is run.
df.sample(5)

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
1264,2,A,32,2010-05-07,14371.49,False,21.822222,0.748928,8.2
2953,6,A,23,2010-04-02,31044.79,False,18.3,0.718284,7.092
6570,19,A,25,2010-06-04,15043.75,False,20.488889,0.794102,8.185
1102,2,A,18,2010-12-03,71185.71,False,9.627778,0.715378,8.163
2064,4,A,24,2010-04-02,11589.6,False,9.316667,0.723832,7.896


In [4]:
# Flag, per column, whether at least one value is missing
df.isna().any()

store                   False
type                    False
department              False
date                    False
weekly_sales            False
is_holiday              False
temperature_c           False
fuel_price_usd_per_l    False
unemployment            False
dtype: bool

<b>Decision:</b> Every column shows False, which means there are no null values in any of the columns: 'store', 'type', 'department', 'date', 'weekly_sales', 'is_holiday', 'temperature_c', 'fuel_price_usd_per_l', 'unemployment'.

In [5]:
# Inspect each column's dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10774 entries, 0 to 10773
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   store                 10774 non-null  int64  
 1   type                  10774 non-null  object 
 2   department            10774 non-null  int64  
 3   date                  10774 non-null  object 
 4   weekly_sales          10774 non-null  float64
 5   is_holiday            10774 non-null  bool   
 6   temperature_c         10774 non-null  float64
 7   fuel_price_usd_per_l  10774 non-null  float64
 8   unemployment          10774 non-null  float64
dtypes: bool(1), float64(4), int64(2), object(2)
memory usage: 684.0+ KB


<b>Decision:</b> The date column is stored with an object (string) dtype. We can handle it in two different ways:
1. Split the string into its parts
2. Change the data type from object to datetime

In [6]:
# Approach 1: treat the date as plain text and rearrange its pieces.

# Work on a copy so the original data frame is left untouched.
data = df.copy()

# Break each date string apart at the hyphens; the three resulting
# pieces become the 'year', 'month' and 'day' columns.
date_parts = data['date'].str.split('-', expand=True)
data[['year', 'month', 'day']] = date_parts

# Stitch the pieces back together in day-month-year order.
data['formatted_date'] = data['day'].str.cat([data['month'], data['year']], sep='-')

print(data)

       store type  department        date  weekly_sales  is_holiday  \
0          1    A           1  2010-02-05      24924.50       False   
1          1    A           1  2010-03-05      21827.90       False   
2          1    A           1  2010-04-02      57258.43       False   
3          1    A           1  2010-05-07      17413.94       False   
4          1    A           1  2010-06-04      17558.09       False   
...      ...  ...         ...         ...           ...         ...   
10769     39    A          99  2011-12-09        895.00       False   
10770     39    A          99  2012-02-03        350.00       False   
10771     39    A          99  2012-06-08        450.00       False   
10772     39    A          99  2012-07-13          0.06       False   
10773     39    A          99  2012-10-05        915.00       False   

       temperature_c  fuel_price_usd_per_l  unemployment  year month day  \
0           5.727778              0.679451         8.106  2010    02  0

In [7]:
# Approach 2: convert the 'date' column to a real datetime dtype.

# The raw values look like '2010-02-05', so parse with an explicit
# year-month-day format. A value that does not match raises ValueError
# instead of being guessed at, which is what makes the success message
# below trustworthy (and prevents a silent day/month swap if the column
# has already been rewritten in another layout).
try:
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    print("All values in 'date' column are valid dates.")
except ValueError as e:
    # Show the parser's message and carry on; the assignment above did not
    # happen, so the column is left as it was.
    print("Error:", e)
    print("There are non-date values present in the 'date' column.")

All values in 'date' column are valid dates.


In [8]:
# Re-check the dtypes: 'date' should now be listed as datetime64[ns]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10774 entries, 0 to 10773
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   store                 10774 non-null  int64         
 1   type                  10774 non-null  object        
 2   department            10774 non-null  int64         
 3   date                  10774 non-null  datetime64[ns]
 4   weekly_sales          10774 non-null  float64       
 5   is_holiday            10774 non-null  bool          
 6   temperature_c         10774 non-null  float64       
 7   fuel_price_usd_per_l  10774 non-null  float64       
 8   unemployment          10774 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 684.0+ KB


In [9]:
# Render the datetime values as day-month-year text (e.g. 05-02-2010).
# NOTE: strftime produces strings, so after this cell 'date' is no longer a
# datetime column and re-running the cell will fail on the .dt accessor.
df['date'] = df['date'].dt.strftime('%d-%m-%Y')  # Output format: DD-MM-YYYY

In [10]:
# Preview the first rows to confirm the reformatted 'date' values
df.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,05-02-2010,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,05-03-2010,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,02-04-2010,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,07-05-2010,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,04-06-2010,17558.09,False,27.05,0.714586,7.808
