<a href="https://colab.research.google.com/github/mariamffatima/Smart-Energy-Consumption-Recommender-using-FAISS/blob/LLM-deepseek/LLM_Deepseek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-cpu



In [2]:
!pip install -U transformers==4.31.0 datasets==2.14.4 peft==0.4.0



In [3]:
!pip install -q transformers accelerate bitsandbytes peft

In [5]:
!pip install -q faiss-cpu transformers datasets peft bitsandbytes accelerate sentence-transformers

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import faiss
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import os
os.environ["WANDB_DISABLED"] = "true"

# Load and preprocess data
# The CSV is fetched over HTTP on every run; `data` holds the raw frame that
# is handed to preprocess_data() further down.
url = "https://huggingface.co/datasets/panda04/smart-home-dataset/raw/main/smart_home_dataset.csv"
data = pd.read_csv(url)

def preprocess_data(df):
    """Derive time, season, appliance and consumption features.

    The identifier columns are dropped into a new frame first, so the frame
    passed in by the caller is left untouched.

    Args:
        df: Raw frame containing 'Transaction_ID', 'Unix Timestamp',
            'Hour of the Day', 'Day of the Week', 'Month',
            'Energy Consumption (kWh)' and the five appliance columns.

    Returns:
        A new DataFrame without the identifier columns and with the added
        columns 'is_peak_hour', 'part_of_day', 'is_weekend', 'Season',
        'total_appliance_usage' and 'is_high_consumption'.
    """
    df = df.drop(['Transaction_ID', 'Unix Timestamp'], axis=1)

    hour = df['Hour of the Day']
    # Peak windows are 06-09 and 18-21, both ends inclusive.
    df['is_peak_hour'] = (hour.between(6, 9) | hour.between(18, 21)).astype(int)
    # include_lowest=True closes the first bin on the left; without it hour 0
    # lies outside (0, 6] and part_of_day becomes NaN for midnight rows.
    df['part_of_day'] = pd.cut(
        hour,
        bins=[0, 6, 12, 18, 24],
        labels=['night', 'morning', 'afternoon', 'evening'],
        include_lowest=True,
    )
    df['is_weekend'] = df['Day of the Week'].isin(['Saturday', 'Sunday']).astype(int)

    season_dict = {
        'December': 'Winter', 'January': 'Winter', 'February': 'Winter',
        'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
        'June': 'Summer', 'July': 'Summer', 'August': 'Summer',
        'September': 'Fall', 'October': 'Fall', 'November': 'Fall',
    }
    df['Season'] = df['Month'].map(season_dict)

    appliances = ['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave']
    df['total_appliance_usage'] = df[appliances].sum(axis=1)

    # Rows strictly above the 75th percentile of consumption are flagged high.
    consumption_threshold = df['Energy Consumption (kWh)'].quantile(0.75)
    df['is_high_consumption'] = (
        df['Energy Consumption (kWh)'] > consumption_threshold
    ).astype(int)
    return df

# Feature-engineer the full frame, then split it 60/20/20 into train,
# validation and test: 40% is held out first and that hold-out is halved.
# Both splits use random_state=42 so they are reproducible.
data = preprocess_data(data)
train_df, temp_df = train_test_split(data, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

def generate_text_description(row):
    """Render one record as a single descriptive sentence.

    Args:
        row: Mapping-like record (dict or DataFrame row) providing
            'Hour of the Day', 'Season', 'part_of_day',
            'Energy Consumption (kWh)' and the five appliance columns.

    Returns:
        A string giving the hour, season, part of day, the ON/OFF state of
        each appliance and the energy consumption with two decimals.
    """
    appliance_names = ['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave']

    # An appliance counts as ON when its value is strictly positive.
    status_parts = []
    for name in appliance_names:
        state = 'ON' if row[name] > 0 else 'OFF'
        status_parts.append(f"{name}: {state}")
    appliances_status = ", ".join(status_parts)

    hour = row['Hour of the Day']
    season = row['Season']
    day_part = row['part_of_day']
    energy = row['Energy Consumption (kWh)']

    when_clause = f"At {hour}:00 during {season} {day_part}, "
    what_clause = f"appliances: {appliances_status}. Energy: {energy:.2f}kWh"
    return when_clause + what_clause

# Add two text columns to every split, in place: the sentence produced by
# generate_text_description() and a textual label derived from the 0/1
# is_high_consumption flag.
for df in [train_df, val_df, test_df]:
    df['text_description'] = df.apply(generate_text_description, axis=1)
    df['label_text'] = df['is_high_consumption'].apply(lambda x: "High energy consumption" if x == 1 else "Normal energy consumption")

# FAISS setup
# Only the training descriptions are embedded and indexed, so position i in
# the index corresponds to row i of train_df (looked up later with iloc).
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
train_embeddings = embedding_model.encode(train_df['text_description'].tolist())
# Flat L2 index sized to the embedding dimension.
index = faiss.IndexFlatL2(train_embeddings.shape[1])
index.add(train_embeddings)
# Persist the index and the matching frame so the inference cell can reload
# them from disk.
faiss.write_index(index, "faiss_index.bin")
train_df.to_pickle("train_df.pkl")

def augment_text(text, embedding, index, df_train, k=3):
    """Prefix *text* with the descriptions of its nearest training rows.

    Args:
        text: Description to augment.
        embedding: 1-D embedding vector of *text*.
        index: Object exposing a FAISS-style ``search(queries, k)`` method
            that returns ``(distances, indices)``.
        df_train: Frame with a 'text_description' column whose positional
            rows match the ids stored in *index*.
        k: Number of neighbours to request.

    Returns:
        The retrieved descriptions followed by *text*, separated by blank
        lines. When nothing is retrieved, *text* is returned unchanged.
    """
    # FAISS works on float32 matrices; wrap the single vector as one row.
    query = np.asarray([embedding], dtype=np.float32)
    _, indices = index.search(query, k)
    # FAISS pads the result with -1 when fewer than k neighbours exist;
    # iloc[-1] would silently return the last training row, so skip those.
    contexts = [df_train.iloc[i]['text_description'] for i in indices[0] if i >= 0]
    return "\n\n".join([*contexts, text])

def prepare_augmented_dataset(df, embedding_model, index, df_train):
    """Add an 'augmented_text' column built from retrieved neighbours.

    Every description in *df* is embedded in one batch and then passed,
    together with its embedding, to augment_text().

    Args:
        df: Frame with a 'text_description' column; modified in place.
        embedding_model: Encoder exposing ``encode(list_of_str)``.
        index: Nearest-neighbour index forwarded to augment_text().
        df_train: Training frame forwarded to augment_text().

    Returns:
        The same *df* object, now carrying the 'augmented_text' column.
    """
    descriptions = df['text_description'].tolist()
    vectors = embedding_model.encode(descriptions)

    augmented = []
    for vector, description in zip(vectors, descriptions):
        augmented.append(augment_text(description, vector, index, df_train))

    df['augmented_text'] = augmented
    return df

# Build the retrieval-augmented text for the train and validation splits.
# Both are queried against the index built from train_df's descriptions.
# NOTE(review): training rows are searched against an index that contains
# them, so each row presumably retrieves itself as a neighbour — confirm
# this is intended.
train_df = prepare_augmented_dataset(train_df, embedding_model, index, train_df)
val_df = prepare_augmented_dataset(val_df, embedding_model, index, train_df)

# Model setup
# from_pretrained() takes a Hub repo id ("namespace/name") or a local path;
# the model page URL is not a valid identifier.
model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantisation with double quantisation; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# LoRA adapters (rank 16) on the query and value projections only.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, peft_config)

# Training
# Only the prompt text and the textual label are carried into the datasets.
train_dataset = Dataset.from_pandas(train_df[['augmented_text', 'label_text']])
val_dataset = Dataset.from_pandas(val_df[['augmented_text', 'label_text']])

def tokenize_function(examples):
    """Tokenise a batch of prompt/answer pairs for causal-LM fine-tuning.

    Each example becomes ``"<augmented_text>\\n\\n### Answer:\\n<label><eos>"``,
    padded or truncated to 512 tokens.

    Args:
        examples: Batch mapping with parallel 'augmented_text' and
            'label_text' lists (as supplied by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output with an added 'labels' entry: a copy of
        'input_ids' in which padding positions are replaced by -100.
    """
    # The pad token is the EOS token, and padding is masked out of the loss
    # below, so an explicit EOS is appended to keep a real end-of-answer
    # target in the sequence.
    prompts = [
        f"{text}\n\n### Answer:\n{label}{tokenizer.eos_token}"
        for text, label in zip(examples["augmented_text"], examples["label_text"])
    ]
    tokenized = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    # -100 is the index the causal-LM loss ignores; without the mask most of
    # the loss would come from predicting padding.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenise both datasets in batches; tokenize_function adds input_ids,
# attention_mask and labels.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Batch size 2 with 4 accumulation steps gives 8 examples per optimiser step
# on each device. Evaluation and checkpointing both happen once per epoch.
# NOTE(review): an earlier install cell pins transformers==4.31.0; confirm
# that release accepts `eval_strategy` (older releases presumably spell it
# `evaluation_strategy`) — TODO verify against the version actually used.
training_args = TrainingArguments(
    output_dir="./deepseek-finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    gradient_accumulation_steps=4,
    eval_strategy="epoch",  # Changed to eval_strategy
    save_strategy="epoch",
    report_to="none",
    logging_dir="./logs",
    logging_steps=10,
    warmup_ratio=0.1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()
# Save the fine-tuned model and tokenizer to the directory that the
# inference cell loads with PeftModel.from_pretrained.
model.save_pretrained("./deepseek-finetuned-final")
tokenizer.save_pretrained("./deepseek-finetuned-final")

KeyboardInterrupt: 

In [None]:
!pip install -q faiss-cpu sentence-transformers

import pandas as pd
import torch
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from sentence_transformers import SentenceTransformer

# Load saved components
tokenizer = AutoTokenizer.from_pretrained("./deepseek-finetuned-final")
# from_pretrained() takes a Hub repo id ("namespace/name") or a local path;
# the model page URL is not a valid identifier.
base_model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-llm-7b-base",
    device_map="auto",
    trust_remote_code=True
)
# Attach the saved adapter, then fold it into the base weights so that
# generation runs on a plain model.
model = PeftModel.from_pretrained(base_model, "./deepseek-finetuned-final")
model = model.merge_and_unload()

# Retrieval artefacts written by the training cell.
index = faiss.read_index("faiss_index.bin")
# NOTE: unpickling executes arbitrary code; only load a train_df.pkl that
# this notebook produced itself.
train_df = pd.read_pickle("train_df.pkl")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_recommendation(input_data):
    """Generate the model's answer for one smart-home reading.

    The reading is rendered as a sentence and embedded. The three nearest
    training descriptions are retrieved as context, and the fine-tuned model
    completes the prompt after the "### Answer:" marker.

    Args:
        input_data: Mapping with 'Hour of the Day', 'Season', 'part_of_day',
            'Energy Consumption (kWh)' and the five appliance keys
            ('Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave').

    Returns:
        The decoded text that follows the last "### Answer:" marker, with
        surrounding whitespace stripped. Sampling is enabled, so repeated
        calls may return different text.
    """
    # Create description
    appliances = ['Television', 'Dryer', 'Oven', 'Refrigerator', 'Microwave']
    appliances_status = ", ".join(
        f"{appliance}: {'ON' if input_data[appliance] > 0 else 'OFF'}"
        for appliance in appliances
    )
    input_text = (f"At {input_data['Hour of the Day']}:00 during {input_data['Season']} {input_data['part_of_day']}, "
                f"appliances: {appliances_status}. Energy: {input_data['Energy Consumption (kWh)']:.2f}kWh")

    # Retrieve context
    # encode() on a one-element list already yields a (1, dim) matrix, which
    # is the query shape index.search() takes — no numpy wrapper (and no
    # numpy import in this cell) is needed.
    query = embedding_model.encode([input_text])
    _, indices = index.search(query, 3)
    # FAISS pads with -1 when fewer than 3 neighbours exist; iloc[-1] would
    # silently return the last training row, so skip those ids.
    contexts = [train_df.iloc[i]['text_description'] for i in indices[0] if i >= 0]

    # Generate response
    prompt = "\n\n".join([*contexts, input_text]) + "\n\n### Answer:\n"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True
    )

    # The decoded sequence contains the prompt too; keep only what follows
    # the answer marker.
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full_response.split("### Answer:")[-1].strip()

# Example test
# A single hand-written reading using the same keys that
# generate_recommendation() reads: 19:00 on a winter evening with the
# television, oven and refrigerator on.
test_case = {
    'Hour of the Day': 19,
    'Season': 'Winter',
    'part_of_day': 'evening',
    'Television': 1,
    'Dryer': 0,
    'Oven': 1,
    'Refrigerator': 1,
    'Microwave': 0,
    'Energy Consumption (kWh)': 5.8
}

# Run retrieval plus generation once and print the model's answer.
recommendation = generate_recommendation(test_case)
print("Energy Recommendation:")
print(recommendation)

# For custom input:
# custom_input = { ... }  # Create your own input dictionary
# print(generate_recommendation(custom_input))