## AI/ML Internship Assignment

### Objective:

#### Develop a Deep Learning Model for predicting user purchase history based onhistorical data.
 

### Import Libraries

In [13]:
import random  ### Neccessories libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

### Data Simulation

In [2]:
## Data preperation
num_users = 100
min_sequences_per_user = 5
max_sequences_per_user = 10
min_purchases_per_sequence = 3
max_purchases_per_sequence = 5
platforms = ["Amazon Shopping",
  "Flipkart",
  "Amazon Prime Membership",  
  "Zomato",
  "Myntra",
  "Hotstar",
  "McDonald's",
  "Pizza Hut",
  "Ola Cab",
  "PVR Cinemas",
  "Reliance Digital",
  "KFC",
  "Makemytrip",
  "Uber",
  "Cafe Coffee Day",
  "Blinkit",
  "Swiggy",
  "Ajio",
  "Dominos",
  "Wildcraft"]

In [3]:
data = []
for user_id in range(1, num_users + 1):
    num_sequences = random.randint(min_sequences_per_user, max_sequences_per_user)
    for _ in range(num_sequences):
        num_purchases = random.randint(min_purchases_per_sequence, max_purchases_per_sequence)
        purchases = random.choices(platforms, k=num_purchases)
        data.append({'user': f'user{user_id}', 'purchases': ', '.join(purchases)})

print(data)

[{'user': 'user1', 'purchases': 'Hotstar, Makemytrip, KFC, Pizza Hut'}, {'user': 'user1', 'purchases': 'Ola Cab, Wildcraft, Pizza Hut'}, {'user': 'user1', 'purchases': 'Dominos, Dominos, Swiggy'}, {'user': 'user1', 'purchases': 'PVR Cinemas, Blinkit, Blinkit, Flipkart, Amazon Shopping'}, {'user': 'user1', 'purchases': "Ajio, McDonald's, Uber, Makemytrip, McDonald's"}, {'user': 'user2', 'purchases': 'Myntra, Swiggy, Hotstar'}, {'user': 'user2', 'purchases': 'PVR Cinemas, Amazon Prime Membership, Swiggy'}, {'user': 'user2', 'purchases': 'Swiggy, Wildcraft, Uber, Amazon Prime Membership, Zomato'}, {'user': 'user2', 'purchases': 'Swiggy, PVR Cinemas, Zomato'}, {'user': 'user2', 'purchases': 'Pizza Hut, Wildcraft, Myntra'}, {'user': 'user2', 'purchases': 'Amazon Shopping, PVR Cinemas, Amazon Prime Membership'}, {'user': 'user2', 'purchases': "PVR Cinemas, Cafe Coffee Day, McDonald's"}, {'user': 'user2', 'purchases': 'Flipkart, Pizza Hut, KFC, Amazon Prime Membership'}, {'user': 'user3', 'pu

In [4]:
# convert data into datafframe
df= pd.DataFrame(data)
df

Unnamed: 0,user,purchases
0,user1,"Hotstar, Makemytrip, KFC, Pizza Hut"
1,user1,"Ola Cab, Wildcraft, Pizza Hut"
2,user1,"Dominos, Dominos, Swiggy"
3,user1,"PVR Cinemas, Blinkit, Blinkit, Flipkart, Amazo..."
4,user1,"Ajio, McDonald's, Uber, Makemytrip, McDonald's"
...,...,...
752,user100,"Ola Cab, Zomato, Uber, Uber, Swiggy"
753,user100,"Cafe Coffee Day, McDonald's, Myntra, Amazon Sh..."
754,user100,"Makemytrip, Pizza Hut, Hotstar"
755,user100,"Uber, Makemytrip, Reliance Digital, McDonald's..."


 ### Text preprocessing

In [5]:
 #Text preprocessing
import re
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['purchases'] = df['purchases'].apply(preprocess_text)


In [6]:
# Tokenize the purchases
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['purchases'])
sequences = tokenizer.texts_to_sequences(df['purchases'])

In [8]:
sequences

[[16, 7, 2, 22],
 [8, 9, 7, 1, 3, 2, 11],
 [22, 12, 2, 16, 7],
 [1, 27, 28, 4, 1, 3, 11],
 [11, 11, 2],
 [5, 6, 13, 2, 12],
 [20, 21, 16, 13, 20, 21],
 [10, 12, 7, 1, 27, 28, 16],
 [23, 13, 14],
 [15, 4, 7, 1, 3],
 [12, 15, 20, 21, 16, 1, 27, 28],
 [5, 6, 25, 26, 1, 27, 28, 4],
 [16, 7, 2, 10],
 [20, 21, 10, 22],
 [16, 13, 13, 7],
 [12, 25, 26, 7, 20, 21],
 [17, 18, 19, 8, 9, 1, 3, 13, 16],
 [2, 17, 18, 19, 7, 22],
 [2, 13, 14, 14],
 [16, 23, 17, 18, 19],
 [16, 1, 3, 25, 26, 8, 9, 11],
 [10, 17, 18, 19, 22, 14],
 [1, 27, 28, 4, 22],
 [23, 25, 26, 1, 3, 5, 6, 14],
 [17, 18, 19, 13, 1, 27, 28],
 [14, 15, 8, 9],
 [13, 5, 6, 12],
 [10, 2, 10, 17, 18, 19],
 [20, 21, 23, 23, 17, 18, 19],
 [5, 6, 5, 6, 8, 9],
 [24, 17, 18, 19, 24, 5, 6, 22],
 [15, 16, 22],
 [7, 7, 16],
 [2, 25, 26, 4],
 [10, 20, 21, 12],
 [15, 14, 10, 4],
 [8, 9, 17, 18, 19, 8, 9, 23],
 [8, 9, 25, 26, 5, 6],
 [4, 12, 16],
 [14, 7, 12, 25, 26],
 [24, 20, 21, 13, 17, 18, 19],
 [13, 7, 22, 4],
 [13, 5, 6, 14, 11, 5, 6],
 [17, 18

In [7]:
### Adding padding
padded_sequences = pad_sequences(sequences, padding='post')

In [8]:
padded_sequences.shape

(757, 12)

In [10]:
## Input features anf label
X = []
y = []
for seq in padded_sequences:
    for i in range(1, len(seq)):
        X.append(seq[:i])
        y.append(seq[i])

In [14]:
X = pad_sequences(X, padding='post')
y = np.array(y)


In [15]:
X

array([[ 2,  0,  0, ...,  0,  0,  0],
       [ 2,  5,  0, ...,  0,  0,  0],
       [ 2,  5,  8, ...,  0,  0,  0],
       ...,
       [28, 19, 12, ...,  0,  0,  0],
       [28, 19, 12, ...,  0,  0,  0],
       [28, 19, 12, ...,  0,  0,  0]])

In [16]:
y

array([ 5,  8, 23, ...,  0,  0,  0])

### Data splitinig

In [17]:
## Spliting data into trainin,testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)


In [18]:
X_test


array([[28, 22,  0, ...,  0,  0,  0],
       [28,  4,  0, ...,  0,  0,  0],
       [17, 18, 14, ...,  0,  0,  0],
       ...,
       [ 3, 12,  0, ...,  0,  0,  0],
       [17,  4, 16, ...,  0,  0,  0],
       [ 8,  1,  6, ...,  0,  0,  0]])

### Model Building

In [19]:
## Model building

vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=X.shape[1]))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))




In [20]:
## model comilation
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])


### Model trainig

In [21]:
# Train the model
model.fit(X_train, y_train, epochs=20, validation_split=0.2)


Epoch 1/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 66ms/step - accuracy: 0.5472 - loss: 2.4940 - val_accuracy: 0.5469 - val_loss: 2.0814
Epoch 2/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 38ms/step - accuracy: 0.5764 - loss: 1.8679 - val_accuracy: 0.5761 - val_loss: 1.8404
Epoch 3/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 0.5958 - loss: 1.7526 - val_accuracy: 0.5626 - val_loss: 1.8277
Epoch 4/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.5826 - loss: 1.7711 - val_accuracy: 0.5829 - val_loss: 1.7560
Epoch 5/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 61ms/step - accuracy: 0.6066 - loss: 1.6412 - val_accuracy: 0.6159 - val_loss: 1.3889
Epoch 6/20
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.6438 - loss: 1.2893 - val_accuracy: 0.6302 - val_loss: 1.3267
Epoch 7/20
[1m167

<keras.src.callbacks.history.History at 0x1ebe0f5b610>

### Model performance

In [22]:
# Model accuracy
loss,accuracy = model.evaluate(X_test,y_test)
print(loss)
print(accuracy)


[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.7376 - loss: 1.0397
1.0786100625991821
0.7292917370796204


### Try with random variable

In [None]:
last_purchases ='Uber, Makemytrip, McDonald'
input_seq = tokenizer.texts_to_sequences([last_purchases])
input_seq = pad_sequences(input_seq, maxlen=X.shape[1], padding='post')
predicted_probs = model.predict(input_seq)

# Exclude the padding token (index 0) from predictions
predicted_probs[0][0] = 0
predicted = np.argmax(predicted_probs, axis=-1)


if predicted[0] in tokenizer.index_word:
    next_purchase = tokenizer.index_word[predicted[0]]
else:
    next_purchase = 'Unknown'

print(f"The next purchase could be: {next_purchase}")

print(f"Predicted index: {predicted[0]}")
#print(f"Word index: {tokenizer.index_word}"

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
The next purchase could be: amazon
Predicted index: 1


### predict the next purchase given a sequence of previous purchases

In [25]:
# Function to predict the next purchase given a sequence of previous purchases

def predict_next_purchase(purchase_sequence):
    # Tokenize and pad the input sequence
    input_seq = tokenizer.texts_to_sequences([purchase_sequence])
    input_seq = pad_sequences(input_seq, maxlen=X.shape[1], padding='post')
    # Predict the probabilities for the next purchase
    predicted_probs = model.predict(input_seq)
    # Exclude the padding token (index 0) from predictions
    predicted_probs[0][0] = 0
    # Get the index of the highest probability
    predicted = np.argmax(predicted_probs, axis=-1)
    # Convert the index to the corresponding purchase
    if predicted[0] in tokenizer.index_word:
        next_purchase = tokenizer.index_word[predicted[0]]
    else:
        next_purchase = 'Unknown'
    return next_purchase

# Example usage:
last_purchases = 'Uber, Makemytrip, Myntra'
next_purchase = predict_next_purchase(last_purchases)
print(f"The next purchase could be: {next_purchase}")

# Test the model with multiple sequences from the dataset
for i in range(5):
    sample_sequence = ', '.join(random.choice(df['purchases']))
    next_purchase = predict_next_purchase(sample_sequence)
    print(f"Given sequence: {sample_sequence}")
    print(f"Predicted next purchase: {next_purchase}")
    print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
The next purchase could be: amazon
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 892ms/step
Given sequence: m, y, n, t, r, a,  , r, e, l, i, a, n, c, e,  , d, i, g, i, t, a, l,  , a, m, a, z, o, n,  , p, r, i, m, e,  , m, e, m, b, e, r, s, h, i, p
Predicted next purchase: digital

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Given sequence: a, j, i, o,  , a, m, a, z, o, n,  , p, r, i, m, e,  , m, e, m, b, e, r, s, h, i, p,  , k, f, c,  , p, v, r,  , c, i, n, e, m, a, s
Predicted next purchase: digital

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
Given sequence: p, v, r,  , c, i, n, e, m, a, s,  , a, j, i, o,  , k, f, c,  , f, l, i, p, k, a, r, t
Predicted next purchase: digital

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Given sequence: u, b, e, r,  , b, l, i, n, k, i, t,  , a, m, a, z, o, n,  , p, r, i, m, e,  , m, e,