In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the dataset.
# Later cells read the columns 'summary', 'director_name', 'author_name',
# 'actor_name', 'genre', 'rating' and 'movie_id' from this frame.
data = pd.read_csv('merged_partial.csv')

In [None]:
def textrank(sentence, lower=True, window=2, topK=20, withWeight=False, allowPOS=()):
    """
    Extract keywords with TextRank (PageRank over a word co-occurrence graph).

    :param sentence: str, the text to analyse
    :param lower: bool, lower-case the text before segmentation
    :param window: int, how many following words each word is linked to
    :param topK: int, maximum number of keywords to return
    :param withWeight: bool, return (word, score) pairs instead of bare words
    :param allowPOS: tuple of POS-tag prefixes to keep, e.g. ('n', 'v');
        empty (the default) disables part-of-speech filtering
    :return: list of keywords (or (keyword, score) tuples when withWeight is
        True), highest score first; [] when the input is not a non-blank
        string or no two candidate words co-occur
    """
    # Missing values (None / NaN) and blank strings have no keywords.
    if not isinstance(sentence, str) or not sentence.strip():
        return []
    if lower:
        sentence = sentence.lower()

    if isinstance(allowPOS, str):
        allowPOS = (allowPOS,)

    if allowPOS:
        # The POS filter must look at the tag of each token, not at its text.
        # Imported locally (under an alias, so the global `jieba` name is not
        # shadowed) because `import jieba` does not load the submodule.
        import jieba.posseg as pseg
        prefixes = tuple(allowPOS)
        tokens = [pair.word for pair in pseg.lcut(sentence)
                  if pair.flag.startswith(prefixes)]
    else:
        tokens = jieba.lcut(sentence)

    # Keep candidate words only: drop whitespace, punctuation and
    # single-character tokens (mostly particles in Chinese text).
    words = [
        token for token in tokens
        if len(token.strip()) >= 2 and any(ch.isalnum() for ch in token)
    ]

    G = nx.Graph()
    G.add_nodes_from(words)

    # Link every word to the `window` words that follow it. The graph is
    # undirected, so this also covers the preceding words; self-loops are
    # skipped because they would inflate a word's own score.
    for idx, word in enumerate(words):
        for neighbour in words[idx + 1: idx + 1 + window]:
            if neighbour != word:
                G.add_edge(word, neighbour)

    if G.number_of_edges() == 0:
        return []
    scores = nx.pagerank(G)

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:topK]
    if withWeight:
        return ranked
    return [word for word, _ in ranked]

# Example usage: print the five top-ranked keywords of a short Chinese passage.
text = "自然语言处理是人工智能和语言学领域的分支学科。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。"
keywords = textrank(text, topK=5)
print(keywords)


In [None]:
# Feature extraction
def extract_keywords(summary, top_k=6):
    """
    Return up to ``top_k`` keywords of a plot summary, ranked by TF-IDF.

    The text is segmented by jieba and scored against jieba's built-in IDF
    table. Fitting a TfidfVectorizer on the single summary cannot rank terms:
    with one document every term receives the same IDF, and its default
    tokenizer does not segment Chinese text.

    :param summary: str (or missing value), the plot summary
    :param top_k: int or None, maximum number of keywords; None returns all
    :return: list of keyword strings, most important first; [] for missing
        or blank summaries or a non-positive ``top_k``
    """
    if pd.isnull(summary):  # missing value
        return []
    text = str(summary).strip()
    if not text or (top_k is not None and top_k <= 0):
        return []
    # Imported locally because `import jieba` does not load the submodule.
    import jieba.analyse as jieba_analyse
    return list(jieba_analyse.extract_tags(text, topK=top_k))

In [None]:
# Convert a plot summary into its keywords
def get_keywords_from_summary(summary):
    """
    Return the top five keywords of ``summary`` joined by single spaces.

    A missing summary (anything ``pd.isnull`` reports as null) yields an
    empty string.
    """
    return "" if pd.isnull(summary) else ' '.join(extract_keywords(summary, top_k=5))

In [None]:
# Turn every plot summary into a space-separated keyword string.
data['keywords'] = data['summary'].apply(get_keywords_from_summary)

# Vectorize the keyword strings with TF-IDF.
tfidf_vectorizer = TfidfVectorizer()
keyword_vectors = tfidf_vectorizer.fit_transform(data['keywords']).toarray()

# Wrap the vectors in a DataFrame. Column names are prefixed so that a keyword
# can never collide with an existing column of `data`, and the frame shares
# the index of `data` so the concat below aligns row by row.
KEYWORD_COLUMN_PREFIX = 'kw_'
keyword_df = pd.DataFrame(
    keyword_vectors,
    columns=[KEYWORD_COLUMN_PREFIX + term
             for term in tfidf_vectorizer.get_feature_names_out()],
    index=data.index,
)

# Merge the keyword vectors into the original data. Keyword columns left over
# from a previous run of this cell are dropped first, so re-running the cell
# does not duplicate them.
stale_keyword_columns = [
    col for col in data.columns if str(col).startswith(KEYWORD_COLUMN_PREFIX)
]
data = pd.concat([data.drop(columns=stale_keyword_columns), keyword_df], axis=1)

# Save the vectorized result for inspection.
data.to_csv('result.csv')

In [None]:
# Vectorize the other text features (director, screenwriter, actors, genre).
def _tfidf_rows(column):
    """
    Return a Series holding one TF-IDF vector (1-D array) per row of ``column``.

    Each cell is tokenized into whole entries. Splitting a name into single
    characters would leave nothing for the default token pattern, which
    requires at least two characters. Missing cells become all-zero vectors
    instead of the literal token "nan". A dedicated vectorizer is used so the
    keyword vectorizer fitted earlier is not refit.
    """
    cleaned = column.map(lambda value: '' if pd.isnull(value) else str(value))
    # NOTE(review): assumes entries are separated by "/" (as the genre column
    # is), by a comma-like character or by whitespace - confirm against the data.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)[^\s/,，、;；|]+")
    matrix = vectorizer.fit_transform(cleaned).toarray()
    # A 2-D array cannot be assigned to a single column; store one row vector
    # per cell instead.
    return pd.Series(list(matrix), index=column.index)


data['director_vector'] = _tfidf_rows(data['director_name'])
data['author_vector'] = _tfidf_rows(data['author_name'])
data['actor_vector'] = _tfidf_rows(data['actor_name'])
data['genre_vector'] = _tfidf_rows(data['genre'])

In [None]:
# Weighted combination of feature values
def calculate_feature_weight_similarity(feature_vectors, weights):
    """
    Return the dot product of ``feature_vectors`` with ``weights``.

    :param feature_vectors: array-like whose last axis matches ``weights``
    :param weights: array-like of per-feature weights
    :return: the result of ``np.dot(feature_vectors, weights)``
    """
    return np.dot(feature_vectors, weights)

# Feature weights.
# NOTE(review): presumably ordered as director, screenwriter, actors, genre,
# keywords, matching the order in which the similarity cell combines the
# feature vectors - confirm against the source document.
weights = np.array([0.5, 0.5, 0.5, 1, 1.5])  # weights taken from the PDF document

In [None]:
# Compute the similarity between movies: the weighted mean of the per-feature
# cosine similarities. Row i of the result holds the similarity of movie i to
# every movie (itself included), which is the per-row array the similar-movie
# scoring step ranks.
feature_matrices = [
    np.vstack(data['director_vector'].tolist()),
    np.vstack(data['author_vector'].tolist()),
    np.vstack(data['actor_vector'].tolist()),
    np.vstack(data['genre_vector'].tolist()),
    keyword_df.to_numpy(),  # keyword TF-IDF vectors, one row per movie
]
assert len(weights) == len(feature_matrices), "one weight per feature is required"

# Accumulate one feature at a time so only a single N x N matrix is kept alive.
similarity_matrix = np.zeros((len(data), len(data)))
for feature_weight, feature_matrix in zip(weights, feature_matrices):
    similarity_matrix += feature_weight * cosine_similarity(feature_matrix)
similarity_matrix /= weights.sum()

data['movie_similarity'] = pd.Series(list(similarity_matrix), index=data.index)

In [None]:
# Compute the "similar movie score" feature
def calculate_similar_movie_scores(data, top_k=10):
    """
    For every movie, average the ratings of its ``top_k`` most similar movies.

    :param data: DataFrame with a 'rating' column and a 'movie_similarity'
        column whose cells are 1-D arrays; element j of row i is the
        similarity of movie i to the movie at row position j
    :param top_k: int, number of neighbours to average over
    :return: Series aligned to ``data.index`` with the mean neighbour rating
        (NaN when a movie has no neighbour)
    """
    ratings = data['rating']
    similar_movie_scores = []
    for position, similarities in enumerate(data['movie_similarity']):
        similarities = np.asarray(similarities, dtype=float)
        # Exclude the movie itself by position. Assuming it is the top-ranked
        # entry breaks on ties and on all-zero feature vectors.
        others = np.delete(np.arange(similarities.shape[0]), position)
        # Stable descending sort: ties keep row order, NaN similarities go last.
        order = np.argsort(-similarities[others], kind='stable')
        neighbours = others[order[:top_k]]
        # The similarity arrays are positional, so select ratings with iloc.
        similar_movie_scores.append(ratings.iloc[neighbours].mean())
    return pd.Series(similar_movie_scores, index=data.index)

# Store the mean rating of each movie's most similar movies as a new feature
# column (uses the function's default top_k of 10).
data['similar_movie_score'] = calculate_similar_movie_scores(data)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Prepare the data.
# Rows without a rating cannot serve as supervised training targets.
model_data = data.dropna(subset=['rating'])
y = model_data['rating']
# Keep numeric feature columns only: the raw text columns and the per-row
# vector columns are object-typed and cannot be consumed by the regressor.
X = (
    model_data
    .drop(columns=['rating', 'movie_id', 'movie_similarity'], errors='ignore')
    .select_dtypes(include='number')
)
X.columns = X.columns.astype(str)  # scikit-learn requires string feature names

# Split into training and test sets.
# NOTE(review): 'similar_movie_score' is derived from the ratings of all rows,
# including rows that end up in the test set, so the reported error is
# optimistic - consider computing that feature from training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Impute missing feature values with statistics from the training split only.
train_means = X_train.mean()
X_train = X_train.fillna(train_means).fillna(0.0)
X_test = X_test.fillna(train_means).fillna(0.0)

# Build the random forest model.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict.
y_pred = rf.predict(X_test)

# Compute the mean squared error.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
