<a href="https://colab.research.google.com/github/mariamffatima/Smart-Energy-Consumption-Recommender-using-FAISS/blob/LLM-deepseek/LLM_Deepseek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-cpu



In [2]:
from IPython import get_ipython
from IPython.display import display
# %%
!pip install faiss-cpu
# %%
!pip install -q transformers accelerate bitsandbytes sentence-transformers datasets==3.4.1
# Install peft from Hugging Face main branch - often required for unsloth from git
!pip install git+https://github.com/huggingface/peft.git
# Install unsloth
!pip install unsloth[colab-new]@git+https://github.com/unslothai/unsloth.git

import unsloth
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import faiss
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, PeftModel # Import peft after installing it from git
from datasets import Dataset
import os


os.environ["WANDB_DISABLED"] = "true"

# Load and preprocess data
url = "https://huggingface.co/datasets/panda04/smart-home-dataset/raw/main/smart_home_dataset.csv"
data = pd.read_csv(url)

def preprocess_data(df):
    df = df.drop(['Transaction_ID', 'Unix Timestamp'], axis=1)
    df['is_peak_hour'] = df['Hour of the Day'].apply(lambda x: 1 if (6 <= x <= 9) or (18 <= x <= 21) else 0)
    df['part_of_day'] = pd.cut(df['Hour of the Day'], bins=[0, 6, 12, 18, 24], labels=['night', 'morning', 'afternoon', 'evening'])
    df['is_weekend'] = df['Day of the Week'].apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0)
    df['Season'] = df['Month'].map({'December': 'Winter', 'January': 'Winter', 'February': 'Winter', 'March': 'Spring', 'April': 'Spring', 'May': 'Spring', 'June': 'Summer', 'July': 'Summer', 'August': 'Summer', 'September': 'Fall', 'October': 'Fall', 'November': 'Fall'})
    df['total_appliance_usage'] = df[['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave']].sum(axis=1)
    df['is_high_consumption'] = df['Energy Consumption (kWh)'].apply(lambda x: 1 if x > df['Energy Consumption (kWh)'].quantile(0.75) else 0)
    return df

data = preprocess_data(data)
train_df, temp_df = train_test_split(data, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Ensure 'text_description' column exists in train_df
if 'text_description' not in train_df.columns:
    # Create a simple text description if it doesn't exist for embedding
    # This is a placeholder and might need to be adjusted based on your data
    # Assuming you want to describe the state of the home based on the columns
    train_df['text_description'] = train_df.apply(lambda row: f"Hour: {row['Hour of the Day']}, Day: {row['Day of the Week']}, Month: {row['Month']}, Television: {row['Television']}, Dryer: {row['Dryer']}, Oven: {row['Oven']}, Refrigerator: {row['Refrigerator']}, Microwave: {row['Microwave']}, Energy Consumption: {row['Energy Consumption (kWh)']:.2f} kWh", axis=1)


train_embeddings = embedding_model.encode(train_df['text_description'].tolist())
index = faiss.IndexFlatL2(train_embeddings.shape[1])
index.add(train_embeddings)

faiss.write_index(index, "faiss_index.bin")
train_df.to_pickle("train_df.pkl")

# Model setup with Unsloth for optimized training
model_name = "deepseek-ai/deepseek-llm-7b-base" # Changed from the full URL
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, peft_config)

# Check if 'text_description' is in train_df before creating dataset
if 'text_description' not in train_df.columns:
     # Create a simple text description if it doesn't exist for training
     # This is a placeholder and might need to be adjusted based on your data
     # Assuming you want to describe the state of the home based on the columns
     train_df['text_description'] = train_df.apply(lambda row: f"Hour: {row['Hour of the Day']}, Day: {row['Day of the Week']}, Month: {row['Month']}, Television: {row['Television']}, Dryer: {row['Dryer']}, Oven: {row['Oven']}, Refrigerator: {row['Refrigerator']}, Microwave: {row['Microwave']}, Energy Consumption: {row['Energy Consumption (kWh)']:.2f} kWh", axis=1)


train_dataset = Dataset.from_pandas(train_df[['text_description', 'is_high_consumption']])


Collecting git+https://github.com/huggingface/peft.git
  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-d6qyfqan
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-d6qyfqan
  Resolved https://github.com/huggingface/peft.git to commit 8af29c646860e617b641225caf7ef47f7c3dcd26
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-h2tz4xs3/unsloth_ad996163d30c4e49a932ba7593dfb7df
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-h2tz4xs3/unsloth_ad996163d30c4e49a932ba7593dfb7df
  Resolved https://github.com/unslothai/unsloth

tokenizer_config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [None]:
# Tokenization and Training Setup
def tokenize_function(examples):
    # Combine input text and the target output text
    combined_texts = []
    for i in range(len(examples['text_description'])):
        input_text = examples['text_description'][i]
        label_text = "High energy consumption" if examples['is_high_consumption'][i] else "Normal energy consumption"
        # You might want to use a specific separator or format here
        combined_texts.append(f"{input_text} ### Output: {label_text}{tokenizer.eos_token}")

    # Tokenize the combined texts
    tokenized = tokenizer(
        combined_texts,
        padding="max_length",
        truncation=True,
        max_length=256, # Keep a reasonable max_length for the combined sequence
        return_tensors="pt" # Return PyTorch tensors
    )

    # Shift the input IDs to create labels for causal language modeling
    labels = tokenized["input_ids"].clone()
    # In causal language modeling, the labels are the input IDs shifted by one position
    # We also set the padding token id to -100 so it's ignored in the loss calculation
    labels[labels == tokenizer.pad_token_id] = -100
    # Shift the labels by one position to the left for next token prediction
    labels = torch.cat([labels[:, 1:], torch.full((labels.shape[0], 1), -100, dtype=torch.long)], dim=1)


    tokenized["labels"] = labels
    return tokenized

# Apply the modified tokenization function
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text_description', 'is_high_consumption'])


training_args = TrainingArguments(
    output_dir="./deepseek-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-5,
    fp16=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()
model.save_pretrained("./deepseek-finetuned-final")
tokenizer.save_pretrained("./deepseek-finetuned-final")

Map:   0%|          | 0/29383 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,1.6206


In [None]:

# Recommendation Function
def generate_recommendation(input_data):
    input_text = f"At {input_data['Hour of the Day']} during {input_data['Season']} {input_data['part_of_day']}, with appliances status: {', '.join([f'{appliance}: {input_data[appliance]}' for appliance in ['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave']])}."
    embedding = embedding_model.encode([input_text])[0]
    distances, indices = index.search(np.array([embedding]), 3)
    # Ensure 'text_description' is in train_df when retrieving contexts
    if 'text_description' not in train_df.columns:
         train_df['text_description'] = train_df.apply(lambda row: f"Hour: {row['Hour of the Day']}, Day: {row['Day of the Week']}, Month: {row['Month']}, Television: {row['Television']}, Dryer: {row['Dryer']}, Oven: {row['Oven']}, Refrigerator: {row['Refrigerator']}, Microwave: {row['Microwave']}, Energy Consumption: {row['Energy Consumption (kWh)']:.2f} kWh", axis=1)

    contexts = [train_df.iloc[i]['text_description'] for i in indices[0]]

    prompt = "\n\n".join(contexts) + f"\n\n{input_text}\n\n### Answer:\n"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7, do_sample=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("### Answer:")[-1].strip()

# Example Usage
example_input = {"Hour of the Day": 19, "Season": "Winter", "part_of_day": "evening", "Television": 1, "Dryer": 0, "Oven": 1, "Refrigerator": 1, "Microwave": 0}
print("Energy Recommendation:", generate_recommendation(example_input))