In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
folder_path = '/content/drive/My Drive/FYP/'

In [2]:
# Function to clean individual crop data
import pandas as pd

def clean_crop_data(file_path):
    # Load the dataset
    crop_data = pd.read_csv(file_path)

    # Step 1: Convert 'Date' to proper datetime format
    crop_data['Date'] = pd.to_datetime(crop_data['Date'], format='%d-%B-%Y', errors='coerce')

    # Step 2: Handle missing values in 'textbox2' (crop prices)
    crop_data['textbox2'] = crop_data['textbox2'].replace('NA', None)
    crop_data['textbox2'] = pd.to_numeric(crop_data['textbox2'].str.replace(',', ''), errors='coerce')

    # Step 3: Rename columns for clarity
    crop_data = crop_data.rename(columns={
        'textbox3': 'Location',
        'textbox2': 'Price',
        'CropNameUrdu': 'CropName'
    })

    # Step 4: Keep only the first word in the 'Location' column
    crop_data['Location'] = crop_data['Location'].str.split().str[0]

    # Step 5: Drop unnecessary columns ('Month' and 'Year')
    crop_data_cleaned = crop_data.drop(columns=['Month', 'Year'])

    # Step 6: Impute missing prices using interpolation
    crop_data_cleaned['Price'] = crop_data_cleaned['Price'].interpolate(method='linear', limit_direction='forward')

    return crop_data_cleaned

In [3]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import os

# Function to load and preprocess data
def load_and_prepare_data(file_path):
    data = pd.read_csv(file_path)

    # Handle any necessary cleaning (e.g., missing values)
    data['Price'].replace({'NA': None})
    data.dropna(subset=['Price'], inplace=True)
    # Encode categorical columns
    label_encoder_location = LabelEncoder()
    label_encoder_crop = LabelEncoder()

    data['Location_encoded'] = label_encoder_location.fit_transform(data['Location'])
    data['CropName_encoded'] = label_encoder_crop.fit_transform(data['CropName'])

    # Normalize prices (LSTM works better with normalized data)
    scaler = MinMaxScaler(feature_range=(0, 1))
    data['Price'] = scaler.fit_transform(data['Price'].values.reshape(-1, 1))

    # Convert date column to datetime format and extract useful features
    data['Date'] = pd.to_datetime(data['Date'])
    data['Day'] = data['Date'].dt.day
    data['Month'] = data['Date'].dt.month
    data['Year'] = data['Date'].dt.year

    return data, label_encoder_location, label_encoder_crop, scaler


**Resume**

In [4]:
# Load and concatenate multiple years of data
folder_path = '/content/drive/My Drive/FYP/'
all_data = pd.DataFrame()

for year in range(2007, 2025):
    file_path = f'{folder_path}/cleaned_{year}.csv'
    year_data, location_encoder, crop_encoder, scaler = load_and_prepare_data(file_path)
    all_data = pd.concat([all_data, year_data])


In [None]:
num_location = len(location_encoder.classes_)  # Should match the max value in Location_encoded
num_crop = len(crop_encoder.classes_)  # Should match the max value in CropName_encoded


In [None]:
# Check for any out-of-range indices
print(all_data['Location_encoded'].max(), len(location_encoder.classes_))
print(all_data['CropName_encoded'].max(), len(crop_encoder.classes_))


137 138
131 132


In [None]:
# Check that all unique locations and crops are encoded properly
location_mapping = dict(zip(location_encoder.classes_, range(len(location_encoder.classes_))))
crop_mapping = dict(zip(crop_encoder.classes_, range(len(crop_encoder.classes_))))

# Apply encoding (if it wasn't already done correctly)
all_data['Location_encoded'] = all_data['Location'].map(location_mapping)
all_data['CropName_encoded'] = all_data['CropName'].map(crop_mapping)

# Ensure no missing mappings
assert all_data['Location_encoded'].notna().all(), "Some locations were not mapped correctly"
assert all_data['CropName_encoded'].notna().all(), "Some crops were not mapped correctly"


In [None]:
# Find rows with problematic Location/Crop encoding
problematic_location_rows = all_data[all_data['Location_encoded'] >= num_location]
problematic_crop_rows = all_data[all_data['CropName_encoded'] >= num_crop]

print("Problematic Locations:\n", problematic_location_rows)
print("Problematic Crops:\n", problematic_crop_rows)


Problematic Locations:
 Empty DataFrame
Columns: [Location, Date, CropName, Price, Location_encoded, CropName_encoded, Day, Month, Year]
Index: []
Problematic Crops:
 Empty DataFrame
Columns: [Location, Date, CropName, Price, Location_encoded, CropName_encoded, Day, Month, Year]
Index: []


In [None]:
print("Checking for NaN values in Location_encoded and CropName_encoded...")
print(all_data['Location_encoded'].isna().sum(), "NaN values found in Location_encoded")
print(all_data['CropName_encoded'].isna().sum(), "NaN values found in CropName_encoded")


Checking for NaN values in Location_encoded and CropName_encoded...
0 NaN values found in Location_encoded
0 NaN values found in CropName_encoded


In [None]:
problematic_location_rows = all_data[all_data['Location_encoded'].isna()]
problematic_crop_rows = all_data[all_data['CropName_encoded'].isna()]

print("Rows with problematic Location encoding:")
print(problematic_location_rows)

print("Rows with problematic CropName encoding:")
print(problematic_crop_rows)


Rows with problematic Location encoding:
Empty DataFrame
Columns: [Location, Date, CropName, Price, Location_encoded, CropName_encoded, Day, Month, Year]
Index: []
Rows with problematic CropName encoding:
Empty DataFrame
Columns: [Location, Date, CropName, Price, Location_encoded, CropName_encoded, Day, Month, Year]
Index: []


In [None]:
# Check if any locations/crops are missing from the encoder
missing_locations = set(all_data['Location'].unique()) - set(location_encoder.classes_)
missing_crops = set(all_data['CropName'].unique()) - set(crop_encoder.classes_)

print(f"Missing locations: {missing_locations}")
print(f"Missing crops: {missing_crops}")


Missing locations: set()
Missing crops: set()


In [5]:
# Refit encoder to include all unique values
location_encoder.fit(all_data['Location'].unique())
crop_encoder.fit(all_data['CropName'].unique())

# Apply the updated encoding
all_data['Location_encoded'] = location_encoder.transform(all_data['Location'])
all_data['CropName_encoded'] = crop_encoder.transform(all_data['CropName'])


In [7]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
import os

# File paths for saving/loading sequences and labels
sequences_file = '/content/drive/MyDrive/FYP/sequences.pt'
labels_file = '/content/drive/MyDrive/FYP/labels.pt'

# Chunk-based sequence creation function
def create_sequences_in_batches(data, n_steps=30, batch_size=100000):
    sequences = []
    labels = []
    features = ['Location_encoded', 'CropName_encoded', 'Day', 'Month', 'Year', 'Price']

    # Convert data to NumPy array for faster processing
    data_values = data[features].values
    price_values = data['Price'].values

    total_rows = len(data) - n_steps

    # Process data in chunks
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        chunk_sequences = []
        chunk_labels = []

        for i in range(start, end):
            seq_data = data_values[i:i + n_steps]  # Faster access using NumPy slicing
            label = price_values[i + n_steps]

            chunk_sequences.append(seq_data)
            chunk_labels.append(label)
        np.array(chunk_sequences)
        # Convert chunk to tensor and append to main list
        sequences.append(torch.Tensor(np.array(chunk_sequences)))
        labels.append(torch.Tensor(np.array(chunk_labels)))

        print(f"Processed rows from {start} to {end}")

    # Concatenate all batches
    return torch.cat(sequences), torch.cat(labels)

# Function to save the sequences and labels
def save_data(sequences, labels, seq_file_path, lbl_file_path):
    torch.save(sequences, seq_file_path)
    torch.save(labels, lbl_file_path)
    print(f"Data saved to {seq_file_path} and {lbl_file_path}")

# Function to load the sequences and labels if they already exist
def load_data(seq_file_path, lbl_file_path):
    if os.path.exists(seq_file_path) and os.path.exists(lbl_file_path):
        sequences = torch.load(seq_file_path)
        labels = torch.load(lbl_file_path)
        print(f"Data loaded from {seq_file_path} and {lbl_file_path}")
        return sequences, labels
    else:
        return None, None

# Check if the saved data exists
sequences, labels = load_data(sequences_file, labels_file)

if sequences is None or labels is None:
    print("No saved data found. Processing sequences and labels...")

    # Split your large data into smaller chunks and create sequences
    n_steps = 30
    batch_size = 1000000  # Adjust based on memory
    sequences, labels = create_sequences_in_batches(all_data, n_steps=n_steps, batch_size=batch_size)

    # Save the processed sequences and labels to disk
    save_data(sequences, labels, sequences_file, labels_file)
else:
    print("Using previously saved sequences and labels.")

# Create DataLoader
dataset = TensorDataset(sequences, labels)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)


No saved data found. Processing sequences and labels...
Processed rows from 0 to 1000000
Processed rows from 1000000 to 2000000
Processed rows from 2000000 to 3000000
Processed rows from 3000000 to 4000000
Processed rows from 4000000 to 5000000
Processed rows from 5000000 to 6000000
Processed rows from 6000000 to 7000000
Processed rows from 7000000 to 8000000
Processed rows from 8000000 to 9000000
Processed rows from 9000000 to 10000000
Processed rows from 10000000 to 11000000
Processed rows from 11000000 to 12000000
Processed rows from 12000000 to 13000000
Processed rows from 13000000 to 14000000
Processed rows from 14000000 to 15000000
Processed rows from 15000000 to 16000000
Processed rows from 16000000 to 17000000
Processed rows from 17000000 to 18000000
Processed rows from 18000000 to 19000000
Processed rows from 19000000 to 20000000
Processed rows from 20000000 to 21000000
Processed rows from 21000000 to 22000000
Processed rows from 22000000 to 23000000
Processed rows from 230000

In [8]:
import torch
import torch.nn as nn

class LSTMPricePredictor(nn.Module):
    def __init__(self, num_location, num_crop, hidden_size, num_layers, output_size):
        super(LSTMPricePredictor, self).__init__()

        # Embeddings for Location and CropName
        self.location_embedding = nn.Embedding(num_location, 10)  # Embedding size 10
        self.crop_embedding = nn.Embedding(num_crop, 10)  # Embedding size 10

        # LSTM
        input_size = 24  # Updated input size: 10 (location) + 10 (crop) + 4 (other features)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Embed the categorical features (location and crop)
        location_embedded = self.location_embedding(x[:, :, 0].long())  # Shape: [batch_size, sequence_len, 10]
        crop_embedded = self.crop_embedding(x[:, :, 1].long())  # Shape: [batch_size, sequence_len, 10]

        # Extract the other features (Day, Month, Year, and Price)
        other_features = x[:, :, 2:]  # Shape: [batch_size, sequence_len, 4]

        # Concatenate embeddings with other features along the feature dimension (dim=2)
        x = torch.cat((location_embedded, crop_embedded, other_features), dim=2)

        # LSTM forward pass
        h0 = torch.zeros(2, x.size(0), 128).to(x.device)  # Initial hidden state
        c0 = torch.zeros(2, x.size(0), 128).to(x.device)  # Initial cell state
        out, _ = self.lstm(x, (h0, c0))

        # Pass the output of the last time step to the fully connected layer
        out = self.fc(out[:, -1, :])  # Shape: [batch_size, output_size]
        return out

# Initialize the model
hidden_size = 128
num_layers = 2
output_size = 1  # Predicting price

model = LSTMPricePredictor(num_location=len(location_encoder.classes_),
                           num_crop=len(crop_encoder.classes_),
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           output_size=output_size)

# Loss and optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move to device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [None]:
from tqdm import tqdm

def train_model(model, data_loader, num_epochs):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        # Create a progress bar for the current epoch
        progress_bar = tqdm(data_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]")

        for sequences, labels in progress_bar:
            sequences, labels = sequences.to(device), labels.to(device)
            sequences = sequences.float()  # Keep sequences as float, but no unsqueeze

            # Forward pass
            outputs = model(sequences)
            loss = loss_fn(outputs.squeeze(), labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Update the progress bar with the current loss
            progress_bar.set_postfix({'Loss': total_loss / len(data_loader)})

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(data_loader):.4f}')

        # Save the model after each epoch
        model_path = f'/content/drive/My Drive/FYP/lstm_price_model_epoch_{epoch+1}.pth'
        torch.save(model.state_dict(), model_path)

# Train for a few epochs and save progress
train_model(model, data_loader, num_epochs=3)

Epoch [1/3]:   2%|▏         | 6129/370677 [03:02<2:57:02, 34.32it/s, Loss=0.000106]

In [None]:
def predict_future(model, last_n_days_data, n_days_to_predict):
    model.eval()
    predictions = []

    with torch.no_grad():
        input_seq = torch.Tensor(last_n_days_data).unsqueeze(0).to(device)

        for _ in range(n_days_to_predict):
            output = model(input_seq)
            predictions.append(output.item())

            # Add the prediction to the sequence and remove the oldest day
            next_input = torch.cat((input_seq[:, 1:, :], output.unsqueeze(0).unsqueeze(0)), dim=1)
            input_seq = next_input

    return predictions

....