In [20]:
import os, glob, sqlite3
import numpy
import pandas
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [21]:
os.makedirs("Google Play Store Dataset", exist_ok=True)
Dataset_Path = os.path.join("Google Play Store Dataset", "Google Play Store Dataset.csv")

In [22]:
Store = pandas.read_csv(Dataset_Path)
SQLite3_Dataset = Store

In [23]:
class SQLite3:
    def __init__(self, Dataset , Name):
        self.Dataset = Dataset
        self.Name = Name
        
    def SQLite3_Connection(self):
        if self.Name + ".db" not in glob.glob("*db"):
            Connection = sqlite3.connect(self.Name + ".db")
            self.Dataset.to_sql(self.Name, Connection, if_exists='replace', index=False)
            print("Database already created!")
            return Connection
        else: return sqlite3.connect(self.Name + ".db")

    def Exc(self, Query):
        Connection = self.SQLite3_Connection()
        return pandas.read_sql_query(Query, Connection)
    
Conn = SQLite3(SQLite3_Dataset, 'Google Play Store Application')
Query = """
    SELECT * 
    FROM 'Google Play Store Application'
"""        
# Conn.Exc(Query)

In [24]:
class Dataset_Cleanning:
    def __init__(self, Dataset):
        self.Dataset = Dataset.copy().dropna()
        self.Cleaned_Dataset = self.Data_cleaning()
    
    def Data_cleaning(self):
        columns = ['Category', 'Reviews', 'Installs', 'Type', 'Content Rating', 'Genres']
        for Column_Name in self.Dataset.columns:
            if Column_Name in columns:
                try:
                    self.Dataset[Column_Name] = self.Dataset[Column_Name].astype(str) 
                    Column_Cleaning = {Values: Index for Index, Values in enumerate(self.Dataset[Column_Name].unique())}
                    self.Dataset.loc[:, Column_Name] = self.Dataset[Column_Name].map(Column_Cleaning)                    
                except ValueError:
                    pass
                
        self.Dataset['Price'] = self.Dataset['Price'].apply(lambda x: 0 if x == 'Free' else float(x.strip('$')))         
        self.Cleaned_Dataset = self.Dataset
        return self.Cleaned_Dataset
    

In [30]:
class App_Trend_Model:
    def __init__(self, Dataset):
        cleaned_data = Dataset_Cleanning(Dataset).Cleaned_Dataset
        self.X_train, self.X_test, self.Y_train, self.Y_test = self.Data_Preparation(cleaned_data)
        self.Model = self.Build_Model()

    def Data_Preparation(self, Dataset):
        X = Dataset.drop(['Category', 'Reviews', 'Type', 'Content Rating', 'Genres', 'Price'], axis=1)  # Các feature
        Y = Dataset['Installs'].astype(float)
        
        # Xử lý dữ liệu phân loại (Category, Genres, Type, etc.)
        X = pandas.get_dummies(X, drop_first=True)  

        # Chuẩn hóa dữ liệu đầu vào
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

        # Định hình lại dữ liệu cho LSTM (samples, timesteps, features)
        X = X.reshape((X.shape[0], 1, X.shape[1]))

        # Chia tập train/test
        return train_test_split(X, Y, test_size=0.2, random_state=42)

    def Build_Model(self):
        # Xây dựng mô hình LSTM
        model = Sequential()
        model.add(LSTM(64, input_shape=(1, self.X_train.shape[2]), return_sequences=True))
        model.add(Dropout(0.5))
        model.add(LSTM(32))
        model.add(Dense(1, activation='linear'))

        model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
        return model

    def Train_Model(self, epochs=10, batch_size=32):
        # Huấn luyện mô hình
        history = self.Model.fit(
            self.X_train, self.Y_train,
            validation_data=(self.X_test, self.Y_test),
            epochs=epochs,
            batch_size=batch_size
        )

        # Đánh giá mô hình
        test_loss, test_mae = self.Model.evaluate(self.X_test, self.Y_test)
        print(f"MAE trên tập kiểm tra: {test_mae:.2f}")
        return history

model = App_Trend_Model(Store)
model.Train_Model(epochs=20, batch_size=32)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MAE trên tập kiểm tra: 2.58


<keras.callbacks.History at 0x200d916fdf0>