In [1]:
!pip3 install streamlit
!pip3 install pyngrok

Collecting streamlit
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.40.2-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[

In [2]:
%%%writefile content_based_app.py
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import pickle

# Load the product, customer and review tables.
# Paths are relative to the directory the app is launched from.
san_pham = pd.read_csv('San_pham.csv')
khach_hang = pd.read_csv('Khach_hang.csv')
danh_gia = pd.read_csv('Danh_gia.csv')

# Load stopwords, one per line. Blank lines are skipped so that a trailing
# newline in the file does not produce an empty-string "stopword".
with open('vietnamese-stopwords.txt', 'r', encoding="utf8") as file:
    stopwords = [line.strip() for line in file if line.strip()]


# Data preprocessing and merging: every review row is enriched with its
# product columns and then with its customer columns (left joins keep all reviews).
join_san_pham_danh_gia = pd.merge(danh_gia, san_pham, on='ma_san_pham', how='left')
df_ori = pd.merge(join_san_pham_danh_gia, khach_hang, on='ma_khach_hang', how='left')
dataframe = df_ori.copy()

# Explicit copy: `df` gets new/overwritten columns further down, and assigning
# into a plain column slice triggers pandas' SettingWithCopyWarning.
df = dataframe[['ma_san_pham', 'ten_san_pham', 'mo_ta', 'so_sao']].copy()

# Load replacement words (columns: 'original' -> 'replacement').
correct_word_list = pd.read_csv('correct_word_list.csv')
# Rows without an 'original' word are unusable; an empty 'replacement' cell is
# read as NaN by pandas and is treated here as "replace with nothing". Both
# columns are coerced to str because they are later passed to re.escape/re.sub,
# which raise TypeError on non-string values.
_valid_replacements = correct_word_list.dropna(subset=['original'])
replacement_dict = dict(zip(
    _valid_replacements['original'].astype(str),
    _valid_replacements['replacement'].fillna('').astype(str),
))

# Function to clean and correct text
def clean_and_correct_text(text):
    """Normalise a raw text value and apply the word-replacement table.

    Any non-string input yields ``''``. For strings, every character that is
    neither a word character nor whitespace is removed, then digit runs are
    removed, then the text is lower-cased and stripped. Finally each key of
    the module-level ``replacement_dict`` is replaced, as a whole word, by its
    mapped value (applied sequentially in dictionary order).
    """
    if not isinstance(text, str):
        return ''

    without_symbols = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_symbols)
    cleaned = without_digits.lower().strip()

    # \b anchors restrict each substitution to whole-word matches.
    for wrong_word, right_word in replacement_dict.items():
        whole_word = r'\b' + re.escape(wrong_word) + r'\b'
        cleaned = re.sub(whole_word, right_word, cleaned)
    return cleaned

# Clean both free-text columns. `assign` returns a new frame, so this is safe
# even when `df` is a column slice of another DataFrame (no SettingWithCopyWarning).
df = df.assign(
    mo_ta=df['mo_ta'].apply(clean_and_correct_text),
    ten_san_pham=df['ten_san_pham'].apply(clean_and_correct_text),
)

# Create combined content and tokenize.
# After cleaning, both columns contain only strings (missing values became ''),
# so a vectorised concatenation gives the same result as a row-wise ' '.join.
df = df.assign(Content=df['ten_san_pham'] + ' ' + df['mo_ta'])
# NOTE(review): word_tokenize needs NLTK tokenizer data to be installed; no
# nltk.download(...) call is visible in this file -- confirm the environment provides it.
df = df.assign(Content_wt=df['Content'].apply(word_tokenize))

# Required functions

# Load cosine similarity matrix from the pickle file.
# SECURITY: pickle.load can execute arbitrary code; only load a file that was
# produced by this project and never one obtained from an untrusted source.
with open('products_cosine_sim.pkl', 'rb') as f:
    cosine_sim = pickle.load(f)

# Recommendation function
def get_recommendations_cosine(sp_id_or_name, cosine_sim=cosine_sim, nums=10, min_rating=4):
    """Return products similar to the one identified by ``sp_id_or_name``.

    Parameters
    ----------
    sp_id_or_name : int or str
        An ``int`` is matched exactly against ``df['ma_san_pham']``. Any other
        value is converted to text and searched as a literal, case-insensitive
        substring in ``ten_san_pham`` and ``mo_ta``; the first matching row is used.
    cosine_sim : 2-D indexable
        Similarity matrix. Row/column ``i`` is assumed to correspond to row
        ``i`` of the module-level ``df`` -- TODO(review): confirm the pickle was
        built from the same (review-level) frame and that ``df`` has a default
        0..n-1 index, since the index label is used as a matrix position.
    nums : int
        How many of the most similar rows to consider before filtering.
    min_rating : number
        Rows with ``so_sao`` below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``ma_san_pham, ten_san_pham, mo_ta, so_sao``, de-duplicated on
        ``ma_san_pham`` and sorted by ``so_sao`` descending. An empty DataFrame
        is returned when nothing matches or the text query is blank.
    """
    if isinstance(sp_id_or_name, int):
        id_matches = df.index[df['ma_san_pham'] == sp_id_or_name]
        if id_matches.empty:
            return pd.DataFrame()
        idx = id_matches[0]
    else:
        name_query = str(sp_id_or_name)
        # A blank query would match every row and silently pick the first product.
        if not name_query.strip():
            return pd.DataFrame()
        # regex=False: the query is user-typed text, not a pattern; characters
        # such as '(' or '+' would otherwise raise re.error or match unexpectedly.
        in_name = df['ten_san_pham'].str.contains(name_query, case=False, na=False, regex=False)
        in_description = df['mo_ta'].str.contains(name_query, case=False, na=False, regex=False)
        matching_products = df[in_name | in_description]
        if matching_products.empty:
            return pd.DataFrame()
        idx = matching_products.index[0]

    # Rank every other row by similarity to the query row. The query row is
    # excluded explicitly rather than by dropping the first sorted entry, which
    # is only correct when the query row happens to sort first (ties break that).
    ranked = sorted(
        ((position, score) for position, score in enumerate(cosine_sim[idx]) if position != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    sp_indices = [position for position, _ in ranked[:nums]]

    recommended_products = df[['ma_san_pham', 'ten_san_pham', 'mo_ta', 'so_sao']].iloc[sp_indices]
    recommended_products = recommended_products[recommended_products['so_sao'] >= min_rating]
    recommended_products = recommended_products.drop_duplicates(subset='ma_san_pham').sort_values(by='so_sao', ascending=False)

    return recommended_products

# # Lấy 10 sản phẩm
# random_products = df.head(n=10)

# # Hiển thị đề xuất ra bảng
# def display_recommended_products(recommended_products, cols=5):
#     for i in range(0, len(recommended_products), cols):
#         cols = st.columns(cols)
#         for j, col in enumerate(cols):
#             if i + j < len(recommended_products):
#                 product = recommended_products.iloc[i + j]
#                 with col:
#                     st.write(product['ten_san_pham'])
#                     expander = st.expander(f"Mô tả")
#                     product_description = product['mo_ta']
#                     truncated_description = ' '.join(product_description.split()[:100]) + '...'
#                     expander.write(truncated_description)
#                     expander.markdown("Nhấn vào mũi tên để đóng hộp text này.")


# st.session_state.random_products = random_products

# ---------------------------------------------------------------------------
# Streamlit page: a title, a sidebar menu, and one content section per menu entry.
# ---------------------------------------------------------------------------
st.title("Data Science Project")
st.write("##")

menu = ["Yêu cầu bài toán", "Xây dựng model", "Gợi ý cho người dùng"]
choice = st.sidebar.selectbox('Menu', menu)
st.sidebar.write("""#### Thành viên thực hiện:
                 1) CHẾ THỊ ÁNH TUYỀN
                 2) NGUYỄN CHẤN NAM""")
st.sidebar.write("""#### Giảng viên hướng dẫn: """)
st.sidebar.write("""#### Ngày báo cáo đồ án: 12/2024""")
if choice == 'Yêu cầu bài toán':
    # TODO(review): the text below describes spam/ham classification, which does
    # not match this product-recommendation app -- it looks like template text
    # that still needs to be replaced by the project's own problem statement.
    st.subheader("Yêu cầu bài toán")
    st.write("""
    ###### Classifying spam and ham messages is one of the most common natural language processing tasks for emails and chat engines. With the advancements in machine learning and natural language processing techniques, it is now possible to separate spam messages from ham messages with a high degree of accuracy.
    """)
    st.write("""###### => Problem/ Requirement: Use Machine Learning algorithms in Python for ham and spam message classification.""")
    # st.image("ham_spam.jpg")

elif choice == 'Xây dựng model':
    st.subheader("Xây dựng model")
    st.write("##### 1. Some data")
    # st.dataframe(data[['v2', 'v1']].head(3))
    # st.dataframe(data[['v2', 'v1']].tail(3))
    # st.write("##### 2. Visualize Ham and Spam")
    # fig1 = sns.countplot(data=data[['v1']], x='v1')
    # st.pyplot(fig1.figure)

    # st.write("##### 3. Xây dựng model...")
    # st.write("##### 4. Evaluation")
    # st.code("Score train:"+ str(round(score_train,2)) + " vs Score test:" + str(round(score_test,2)))
    # st.code("Accuracy:"+str(round(acc,2)))
    # st.write("###### Confusion matrix:")
    # st.code(cm)
    # st.write("###### Classification report:")
    # st.code(cr)
    # st.code("Roc AUC score:" + str(round(roc,2)))

    # # calculate roc curve
    # st.write("###### ROC curve")
    # fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1])
    # fig, ax = plt.subplots()
    # ax.plot([0, 1], [0, 1], linestyle='--')
    # ax.plot(fpr, tpr, marker='.')
    # st.pyplot(fig)

    # st.write("##### 5. Summary: This model is good enough for Ham vs Spam classification.")

elif choice == 'Gợi ý cho người dùng':
    # Streamlit UI
    st.image('hasaki_banner.jpg')
    st.title('Product Recommendation System')
    # Display first 10 products
    st.subheader('Products List')
    st.dataframe(san_pham.head(10),)  # Display the first 10 products

    user_input = st.text_input("Enter product ID or name:")
    if st.button('Get Recommendations'):
        query = user_input.strip()
        if not query:
            st.warning("Please enter a product ID or name.")
        else:
            # Numeric input is treated as a product ID, anything else as a
            # name/description search. Only the int() parse is inside the try,
            # so a ValueError raised by the recommender itself is not masked.
            try:
                lookup_key = int(query)
            except ValueError:
                lookup_key = query
            recommendations = get_recommendations_cosine(lookup_key)

            # Results are rendered for BOTH lookup kinds; previously this code
            # lived inside the `except` branch, so ID lookups showed nothing.
            if not recommendations.empty:
                st.write("Recommendations:")
                st.dataframe(recommendations)
            else:
                st.write("No recommendations available.")


    # # Kiểm tra xem 'selected_ma_san_pham' đã có trong session_state hay chưa
    # if 'selected_ma_san_pham' not in st.session_state:
    #     # Nếu chưa có, thiết lập giá trị mặc định là None hoặc ID sản phẩm đầu tiên
    #     st.session_state.selected_ma_san_pham = None

    # # Theo cách cho người dùng chọn sản phẩm từ dropdown
    # # Tạo một tuple cho mỗi sản phẩm, trong đó phần tử đầu là tên và phần tử thứ hai là ID
    # product_options = [(row['ten_san_pham'], row['ma_san_pham']) for index, row in st.session_state.random_products.iterrows()]
    # st.session_state.random_products
    # # Tạo một dropdown với options là các tuple này
    # selected_product = st.selectbox(
    #     "Chọn sản phẩm",
    #     options=product_options,
    #     format_func=lambda x: x[0]  # Hiển thị tên sản phẩm
    # )
    # # Display the selected product
    # st.write("Bạn đã chọn:", selected_product)

    # # Cập nhật session_state dựa trên lựa chọn hiện tại
    # st.session_state.selected_ma_san_pham = selected_product[1]

    # if st.session_state.selected_ma_san_pham:
    #     st.write("ma_san_pham: ", st.session_state.selected_ma_san_pham)
    #     # Hiển thị thông tin sản phẩm được chọn
    #     selected_product = df[df['ma_san_pham'] == st.session_state.selected_ma_san_pham]

    #     if not selected_product.empty:
    #         st.write('#### Bạn vừa chọn:')
    #         st.write('### ', selected_product['ten_san_pham'].values[0])

    #         product_description = selected_product['mo_ta'].values[0]
    #         truncated_description = ' '.join(product_description.split()[:100])
    #         st.write('##### Thông tin:')
    #         st.write(truncated_description, '...')

    #         st.write('##### Các sản phẩm liên quan:')
    #         recommendations = get_recommendations_cosine(df, st.session_state.selected_ma_san_pham, cosine_sim=cosine_sim_new, nums=3)
    #         display_recommended_products(recommendations, cols=3)
    #     else:
    #         st.write(f"Không tìm thấy sản phẩm với ID: {st.session_state.selected_ma_san_pham}")


UsageError: Cell magic `%%%writefile` not found.


In [3]:
from pyngrok import ngrok

In [4]:
import os
from getpass import getpass

# Never hard-code the ngrok authtoken in the notebook: it is a credential and
# notebooks get shared/committed together with their source. Read it from the
# NGROK_AUTHTOKEN environment variable, falling back to an interactive prompt.
# NOTE: any token that was previously committed in this cell should be treated
# as compromised and rotated in the ngrok dashboard.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))



In [6]:
# Start Streamlit server on a specific port
! nohup streamlit run app.py --server.port 8501 &
# Start ngrok tunnel to expose the Streamlit server
ngrok_tunnel = ngrok.connect(addr='8501', proto='http', bind_tls=True)
# Print the URL of the ngrok tunnel
print(' * Tunnel URL:', ngrok tunnel.public url)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (<ipython-input-6-95878e5e7047>, line 6)

In [None]:
# ngrok.kill()