In [4]:
import pandas as pd
import numpy as np

# Input file locations
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"

# Read both CSV files into DataFrames (train first, then test)
train, test = map(pd.read_csv, (TRAIN_PATH, TEST_PATH))

# Report the (rows, columns) shape of each frame
for caption, frame in (("Train shape:", train), ("Test shape:", test)):
    print(caption, frame.shape)

# Preview the first rows of each frame (`display` is provided by the notebook)
for frame in (train, test):
    display(frame.head())

# -------------------------
# Basic info
# -------------------------
# Print a heading followed by the column/dtype/non-null summary of each frame.
for heading, frame in (("\nTRAIN INFO", train), ("\nTEST INFO", test)):
    print(heading)
    frame.info()

# -------------------------
# Missing values
# -------------------------
# Per-column count of missing entries, train first and then test.
for heading, frame in (
    ("\nMissing values (Train)", train),
    ("\nMissing values (Test)", test),
):
    print(heading)
    print(frame.isna().sum())

# -------------------------
# Duplicates
# -------------------------
# Whole-row duplicates: rows whose values match in every column.
print("\nDuplicate rows (Train):", train.duplicated().sum())
print("Duplicate rows (Test):", test.duplicated().sum())

# Whole-row comparison alone can miss the duplicates that matter for a
# per-(store, item) daily series:
#   * the test frame carries an `id` column (presumably unique per row --
#     confirm), which would keep any two test rows from comparing equal;
#   * two train rows for the same date/store/item but with different `sales`
#     values are not identical rows, yet they are conflicting observations.
# So also count rows that repeat an earlier (date, store, item) key.
key_cols = ["date", "store", "item"]
print(
    "Duplicate (date, store, item) keys (Train):",
    train.duplicated(subset=key_cols).sum(),
)
print(
    "Duplicate (date, store, item) keys (Test):",
    test.duplicated(subset=key_cols).sum(),
)

# -------------------------
# Sales sanity check
# -------------------------
# Only runs when the frame actually has a `sales` column.
if "sales" in train.columns:
    is_negative = train["sales"].lt(0)  # boolean mask of rows with sales below zero
    print("\nNegative sales count:", is_negative.sum())

# -------------------------
# Date checks
# -------------------------
# Replace the raw `date` column of each frame with parsed datetime values,
# in place, so the min/max below (and later date arithmetic) act on dates.
for frame in (train, test):
    frame["date"] = pd.to_datetime(frame["date"])

# Earliest and latest date covered by each frame
train_first, train_last = train["date"].min(), train["date"].max()
test_first, test_last = test["date"].min(), test["date"].max()

print("\nTrain date range:", train_first, "→", train_last)
print("Test date range:", test_first, "→", test_last)

# -------------------------
# Time continuity check
# -------------------------
# Order rows so that, within each (store, item) pair, dates are ascending.
train_sorted = train.sort_values(by=["store", "item", "date"])

# Day difference between consecutive dates inside each (store, item) group.
# The first row of every group has no predecessor and yields a missing value,
# which value_counts() leaves out by default.
dates_by_series = train_sorted.groupby(["store", "item"])["date"]
gap_days = dates_by_series.diff().dt.days

# Frequency of each gap size, most common first.
date_gaps = gap_days.value_counts()

print("\nDate gaps distribution (days):")
print(date_gaps.head(10))



Train shape: (913000, 4)
Test shape: (45000, 4)


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1



TRAIN INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    913000 non-null  object
 1   store   913000 non-null  int64 
 2   item    913000 non-null  int64 
 3   sales   913000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 27.9+ MB

TEST INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      45000 non-null  int64 
 1   date    45000 non-null  object
 2   store   45000 non-null  int64 
 3   item    45000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.4+ MB

Missing values (Train)
date     0
store    0
item     0
sales    0
dtype: int64

Missing values (Test)
id       0
date     0
store    0
item     0
dtype: int64

Duplicate rows (Train): 0
Duplicate rows (Test): 0

Negati