# 1. Data Pre-Processing

In [None]:
import pandas as pd

# Load the raw score sheet. `df` is the module-level DataFrame that
# preprocessing() and totalCal() below read and modify in place.
df = pd.read_excel("../data/raw.xlsx")
# Preview the first rows (shown as cell output when run in a notebook).
df.head()

In [None]:
# Pre-processing
def preprocessing():
    """Repair invalid sub-scores in the global ``df`` and recompute the totals.

    For each reviewer ``i`` in 1..3 the frame must contain a total column
    ``Xi`` and four sub-score columns ``Xi1``..``Xi4``.  A sub-score is
    treated as invalid when it is greater than 25 or missing; it is then
    re-derived as the total minus the other three sub-scores.  Rows for which
    that value cannot be computed (the total or another sub-score is missing)
    are left untouched.  Finally each total is overwritten with the sum of
    its four sub-scores so that the two always agree.

    The function works in place on the module-level ``df`` and returns
    ``None``.
    """
    for i in range(1, 4):
        # Handle reviewers X1, X2 and X3 one at a time.
        total_col = 'X%s' % i  # name of the total-score column
        cols = ['X%s%s' % (i, j) for j in range(1, 5)]  # sub-score columns
        for col in cols:
            # Parentheses are required: `|` binds tighter than `>`.
            invalid = (df[col] > 25) | df[col].isnull()
            if not invalid.any():
                continue
            others = [c for c in cols if c != col]
            # Back the sub-score out of the total.  skipna=False keeps NaN
            # when any operand is missing, so such rows are dropped below
            # instead of being overwritten with a meaningless value.
            recovered = (
                df.loc[invalid, total_col]
                - df.loc[invalid, others].sum(axis=1, skipna=False)
            ).dropna()
            if not recovered.empty:
                df.loc[recovered.index, col] = recovered
        # Resolve any disagreement between the total and its sub-scores by
        # recomputing the total over ALL rows of df.
        df[total_col] = df[cols].sum(axis=1)

In [None]:
# Compute averaged scores
def totalCal():
    """Add reviewer-averaged score columns to the global ``df``.

    ``Avg1``..``Avg4`` hold the row-wise mean of the matching sub-score
    across the three reviewers (e.g. ``Avg1`` averages ``X11``, ``X21`` and
    ``X31``); ``Avg`` holds the row-wise mean of the totals ``X1``..``X3``.
    The columns are written in place and nothing is returned.
    """
    averaged_from = {
        'Avg1': ['X11', 'X21', 'X31'],
        'Avg2': ['X12', 'X22', 'X32'],
        'Avg3': ['X13', 'X23', 'X33'],
        'Avg4': ['X14', 'X24', 'X34'],
        'Avg': ['X1', 'X2', 'X3'],
    }
    for target, sources in averaged_from.items():
        df[target] = df[sources].mean(axis=1)

In [None]:
# Run the cleaning step, then add the averaged-score columns; both functions
# modify the module-level `df` in place.
preprocessing()
totalCal()

# Persist the cleaned frame; section 2 reads this same file back.
# NOTE(review): to_excel is called without index=False, so the row index is
# presumably written as an extra leading column — confirm this is intended.
df.to_excel('../data/preprocessed/preprocessed.xlsx')

# 2. Comment Text Analysis

In [None]:
import pandas as pd
from jieba import lcut
from jieba import cut
import jieba.posseg
import re

## 2.1 Utility Functions

In [None]:
# stopwords
def loadStopwords(stopwords_dir='/Users/antonchekhov/Desktop/BA_nlp/stopwords'):
    """Load and concatenate the three stop-word lists plus extra punctuation.

    :param stopwords_dir: directory containing the three GBK-encoded
        stop-word files (one word per line); defaults to the location the
        original code hard-coded
    :return: list of stop words — the three files in the order listed below,
        followed by a fixed set of punctuation and numbering symbols.
        Duplicates are not removed.
    """
    import os  # local import: this cell does not import os at file level

    filenames = (
        '四川大学机器智能实验室停用词库.txt',
        '百度停用词列表.txt',
        '中文停用词库.txt',
    )
    stopwords = []
    for filename in filenames:
        path = os.path.join(stopwords_dir, filename)
        stopwords.extend(
            pd.read_csv(path, encoding="gbk", names=["stopword"])["stopword"].tolist()
        )
    # Extra single-character symbols that should never count as words.
    stopwords.extend(list(';: .)(（）-——①②③④⑤⑥⑦⑧⑨'))
    return stopwords
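# NOTE: tokenize() and the word-cloud cell below read a module-level `stopwords`;
# uncomment the next line (or define `stopwords` another way) before running them.
# stopwords = loadStopwords()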


# tokenize
def tokenize(sent, func, stopwords=None):
    """
    Segment a sentence into words and remove stop words.

    :param sent: short sentence text
    :param func: callable that splits ``sent`` into an iterable of words
    :param stopwords: optional iterable of stop words; when omitted, the
        module-level ``stopwords`` variable is used
    :return: list of the remaining words in their original order
        (repeated words are kept)
    :raises NameError: if ``stopwords`` is omitted and no module-level
        ``stopwords`` has been defined
    """
    if stopwords is None:
        # Fall back to the module-level list so existing two-argument calls
        # keep working.
        try:
            stopwords = globals()['stopwords']
        except KeyError:
            raise NameError("name 'stopwords' is not defined") from None
    # A set gives O(1) membership tests instead of scanning the list per word.
    excluded = set(stopwords)
    return [word for word in func(sent) if word not in excluded]

# tokenize according to pos
def cut_words_with_pos(text):
    """Segment ``text`` with ``jieba.posseg`` and keep words by POS flag.

    Only words whose ``flag`` is one of ``a, ad, n, an, vn, nz, nt, nr`` are
    kept (presumably the adjective and noun families of jieba's tag set —
    confirm against the jieba documentation).

    :param text: text to segment
    :return: list of the kept words, in order of appearance
    """
    kept_flags = ["a", "ad", "n", "an", "vn", "nz", "nt", "nr"]
    return [pair.word for pair in jieba.posseg.cut(text) if pair.flag in kept_flags]

# sentence segmentation
def sentCut(sents):
    """
    Split a whole review into short clauses (no word segmentation).

    The text is stripped of leading and trailing whitespace and then split on
    the full-width comma, full stop and semicolon, and on the NUL character.
    Empty pieces are discarded.

    :param sents: full review text; any value whose type is not exactly
        ``str`` (e.g. a NaN float) yields an empty list
    :return: list of non-empty clauses
    """
    if type(sents) is str:
        pieces = re.split(r'，|。|；|\0', sents.strip())
        return list(filter(None, pieces))
    return []

## 2.2 Construct Feature Word Dictionary 

1. 用于模型（tf-idf, word2vec）训练的数据：所有评论分词+去停用词；全部整合在一起 （total_text list）

2. 用于构建词典的分专业类别评论：grouped; 按专业整合再在一起 (subject_map dict)

3. 用于每篇论文情感计算和打分的评论 (reviews_df["Rlist"])

4. 评价标准处理——分词，提取关键词 (target_dict)

In [None]:
# Load the pre-processed data
df = pd.read_excel('../data/preprocessed/preprocessed.xlsx')

# Keep the tag and the three review columns R1, R2, R3.  .copy() makes
# reviews_df an independent frame, so the column assignments below do not
# write into a slice of df.
reviews_df = df[['Tag','R1','R2','R3']].copy()
# Drop every row that has any missing value.
# (To keep rows with at least two non-null values instead:
#  reviews_df.dropna(axis=0, thresh=2))
reviews_df = reviews_df.dropna()
# Cast the review columns to str AFTER dropna, so missing values are removed
# rather than turned into the literal text 'nan'.  All three columns are
# converted and the result is stored back.
for col in ['R1','R2','R3']:
    reviews_df[col] = reviews_df[col].map(str)

# Split every review into a list of short clauses.
# Each row gets [clauses_of_R1, clauses_of_R2, clauses_of_R3], stored in
# the "Rlist" column of reviews_df.
R_ls = [
    [sentCut(row.R1), sentCut(row.R2), sentCut(row.R3)]
    for row in reviews_df.itertuples()
]
# Wrap in a Series on the same index so each nested list lands in one cell.
reviews_df["Rlist"] = pd.Series(R_ls, index=reviews_df.index)

# Group by discipline tag
grouped_reviews = reviews_df.groupby('Tag')

In [None]:
# Print a summary of each Tag group.
for name,group in grouped_reviews:
    print('---------',name,'----------')
    # DataFrame.info() prints its report itself and returns None, so the
    # surrounding print() additionally emits a "None" line.
    print(group.info())

In [None]:
# Summary of the whole review frame; info() prints its report and returns
# None, so a trailing "None" line is printed as well.
print(reviews_df.info())

In [None]:
from collections import Counter

# Gather every review of one discipline (Tag == 8) into a single string.
text_df = reviews_df[reviews_df['Tag']==8]
# In each itertuples() row, position 0 is the index, 1 is Tag and 2..4 are
# R1, R2, R3.  Only real strings are joined, so NaN or numeric cells are
# skipped.  A separate name is used so the R_ls list built earlier is not
# overwritten with a string.
review_text = "".join(
    row[col]
    for row in text_df.itertuples()
    for col in range(2,5)
    if isinstance(row[col], str)
)

# Segment the text and drop stop words (requires the module-level `stopwords`).
stopword_set = set(stopwords)
words = [word for word in lcut(review_text) if word not in stopword_set]

# The 20 most frequent words
result = Counter(words).most_common(20)
#print(result)

# Draw the word cloud
from wordcloud import WordCloud
import wordcloud
content = ' '.join(words)  # WordCloud.generate expects one space-separated string
# NOTE(review): this looks like a macOS system font path — adjust on other platforms.
font_path="/System/Library/fonts/PingFang.ttc"
wc = WordCloud(font_path = font_path,
               color_func=wordcloud.get_single_color_func("black"),
               background_color='white',  # white background
               width=1000,
               height=600,
               ).generate(content)
wc.to_file('WordCloud_8.png')


In [None]:
# Preview the review frame, including the "Rlist" column added above.
reviews_df.head()

In [None]:
# subject_map: segmented review text stored per discipline
#   key:   discipline tag
#   value: list of strings, one per short clause, each of the form
#          "word1 word2 word3"
# total_text: the same clause strings for all disciplines, concatenated
subject_map = {}
total_text = []
for name, group in grouped_reviews:
    # Flatten the three reviews of every row into one list of clauses, then
    # reduce each clause to its POS-filtered words joined by spaces.
    sent_ls = [
        " ".join(cut_words_with_pos(clause))
        for row in group.itertuples()
        for review in (row.Rlist[0], row.Rlist[1], row.Rlist[2])
        for clause in review
    ]
    subject_map[name] = sent_ls
    total_text.extend(sent_ls)

In [None]:
# Evaluation criteria (one descriptive text per criterion)
targets = ["选题与综述：研究的理论意义，实用；对本学科及相关学科领域国内外发展状况和 学术动态的了解程度",
            "创新性及论文价值：论文提出的新见解、新方法所具有的价值.论文成果对科技进步、经济建设、国家安全等方面产生的影响或作用",
            "科研能力与基础知识：论文体现的理论基础的扎实程度；本学科及相关学科领域专门知识的系统性；分析问题、解决问题的能力；研究方法的科学性，是否采用先进技术、设备、信息等进行论文研究工作。",
            "论文规范性:引文的规范性，学风的严谨性；论文语言表达的准确性、逻辑的严密性、书写格式及图表的规范性" ]


# Segment each criterion text into keywords.
# target_dict maps criterion index -> keywords in first-seen order with
# duplicates removed (dict.fromkeys keeps insertion order).
target_dict = {
    i: list(dict.fromkeys(tokenize(target, lcut)))
    for i, target in enumerate(targets)
}

### 2.2.1 Latent Dirichlet Allocation (LDA)

In [None]:
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

In [None]:
# LDA topics for one discipline
def lda(text, num_topics=4):
    """Fit an LDA model on tokenised sentences and print its topics.

    :param text: list of token lists, one per sentence
    :param num_topics: number of topics to fit and to print (default 4).
        The original code trained 40 topics while printing only 4; the same
        value is now used for both.
    :return: None; the topics are printed, 10 words per topic
    """
    dictionary = corpora.Dictionary(text)  # build the token -> id dictionary
    # Bag-of-words form: (token id, count) pairs for each sentence
    corpus = [dictionary.doc2bow(t) for t in text]
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                        passes=30, random_state=1)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))

    
# Run LDA separately on the sentences of each discipline.
for key,value in subject_map.items():
    # Each stored sentence is a space-joined word string; split it back into tokens.
    text = [sent.split() for sent in value]
    print("----------Tag = %s----------"%key)
    lda(text)
    print("---------------------------")

### 2.2.2 Modified K-Means

#### K-Means Based on Word2Vec

In [None]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.cluster import KMeans
import random
import numpy as np

# Fixed seed; also passed to Word2Vec in KmeansByWord2vec below.
SEED = 42
# Seeds Python's built-in `random` module only.
random.seed(SEED)

In [None]:
def vectorize(cut_word_list, model):
    """Represent each tokenised document by the mean of its word vectors.

    :param cut_word_list: iterable of token lists, one per document
    :param model: object exposing ``model.wv`` (token -> vector lookup that
        supports ``in``) and ``model.vector_size``, e.g. a trained Word2Vec
        model
    :return: list with one vector per document; a document with no token
        known to the model gets a zero vector of length ``model.vector_size``
    """
    features = []
    for tokens in cut_word_list:
        # One membership test per token is sufficient; the former nested
        # try/except KeyError could never trigger after it.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # A fresh array per document, so entries never share memory.
            features.append(np.zeros(model.vector_size))
    return features

def KmeansByWord2vec(tag,maxN):
    """
    Cluster one discipline's sentences and list the words nearest each cluster.

    A Word2Vec model is trained on the discipline's sentences, K-Means with
    4 clusters is fitted on the sentence vectors (mean of word vectors), and
    every vocabulary word is then assigned to the centroid nearest to its own
    word vector.

    The previous implementation indexed the sentence-level ``labels`` with
    vocabulary indices and measured distance against the integer index
    instead of the word vector; both are corrected here.

    :param tag: discipline tag (key of ``subject_map``)
    :param maxN: maximum number of words returned per cluster
    :return: list of 4 word lists, one per cluster label, each sorted by
        increasing squared distance to its centroid and cut to ``maxN`` words
    """
    n_clusters = 4
    docs = [doc.split() for doc in subject_map[tag]]
    model = Word2Vec(sentences=docs, vector_size=10, workers=1, seed=SEED)
    # Sentence vectors; cast to one dtype because zero vectors and averaged
    # vectors may differ in precision.
    X = np.asarray(vectorize(docs, model=model), dtype=np.float64)
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit(X)
    centroids = kmeans.cluster_centers_

    # Word i of index_to_key has its vector in row i of wv.vectors.
    words = model.wv.index_to_key
    word_vecs = np.asarray(model.wv.vectors, dtype=np.float64)
    labels = kmeans.predict(word_vecs)
    # Squared Euclidean distance from each word to its assigned centroid
    diffs = word_vecs - centroids[labels]
    dists = (diffs ** 2).sum(axis=1)

    clusters = [[] for _ in range(n_clusters)]
    # Visiting words by increasing distance leaves each cluster list sorted.
    for idx in np.argsort(dists, kind="stable"):
        clusters[labels[idx]].append(words[idx])
    return [cluster[:maxN] for cluster in clusters]

In [None]:
# Example: Tag = 1, at most 20 words per cluster
KmeansByWord2vec(1,20)

In [None]:
# NOTE(review): this ExcelWriter is created but never written to or closed in
# this cell (the to_excel call below is commented out) — confirm whether the
# export should be restored or the writer removed.
write = pd.ExcelWriter("/Users/antonchekhov/Desktop/Keywords.xlsx")
maxN = 20
# Print the clustered keyword lists of every discipline.
for key in subject_map.keys():
    print("----------Tag = %s----------"%key)
    KWs = KmeansByWord2vec(key,maxN)
    # KWs = pd.DataFrame(KWs)
    print(KWs)
    # KWs.to_excel(write,'KM%s'% key)

# 3. Sentiment Calculation

In [None]:
from snownlp import SnowNLP
import numpy as np

In [None]:
# Two sample review sentences used to try out SnowNLP's sentiment score.
s1 = '选题具有理论意义和现实意义，论文语言流畅，总体架构基本合理'
s2 = '论文写作粗糙，表格图表不严谨。'

# Print each sentence with the score SnowNLP assigns to it.
for s in [s1,s2]:
    print('The sentence: ',s)
    print('The sentiment score of the sentence: ', SnowNLP(s).sentiments)
    print("\n")

In [None]:
def sentimentsCal(text, target_dict):
    """
    Score one short clause against each of the four evaluation criteria.

    The clause's SnowNLP sentiment is assigned to every criterion for which
    at least one keyword occurs among the clause's words (after stop-word
    removal); criteria with no matching keyword get the default score 1.

    :param text: short clause text
    :param target_dict: keyword lists per criterion, either a sequence of
        keyword lists (in criterion order) or a dict whose values are the
        keyword lists; at most four entries are supported
    :return: list of four scores, one per criterion
    """
    # Iterating a dict directly would yield its keys, not the keyword lists.
    if isinstance(target_dict, dict):
        keyword_groups = list(target_dict.values())
    else:
        keyword_groups = list(target_dict)

    sentiment = SnowNLP(text).sentiments
    words = set(tokenize(text, lcut))

    # Default of 1 for criteria the clause does not mention.
    scores = [1, 1, 1, 1]
    for i, keywords in enumerate(keyword_groups):
        # Averaging `sentiment` over the matched keywords always gives
        # `sentiment` itself, so a single match is enough.
        if any(kw in words for kw in keywords):
            scores[i] = sentiment
    return scores

def ReviewSentiments(reviews,target_dict):
    """
    Average the per-criterion sentiment scores over a list of clauses.

    :param reviews: list of short clauses
    :param target_dict: keyword lists per criterion, passed through to
        sentimentsCal
    :return: the int 0 when ``reviews`` is empty; otherwise a numpy array
        holding, for each criterion, the mean of the clause scores
    """
    count = len(reviews)
    if count == 0:
        return 0
    per_clause = np.array([sentimentsCal(clause, target_dict) for clause in reviews])
    return np.sum(per_clause, axis=0) / count
    

In [None]:
# Workbook holding the merged keyword dictionary; the scoring loop reads one
# sheet per Tag from it.
path = "../results/feature_dictionary/mergeDict.xlsx"
# Column names read from each sheet, one per evaluation criterion.
cols = ["选题与综述","创新与论文价值","科研能力与基础知识","论文规范性"]
# Independent copy of the review frame that receives the score columns.
new_reviews_df = reviews_df.copy()

In [None]:
# scores[i] collects, row by row, the criterion scores of review i+1 (R1..R3).
scores = [[],[],[]]

# The keyword sheet depends only on the Tag, so load each sheet once rather
# than re-reading the workbook for every row.
keyword_cache = {}
for row in reviews_df.itertuples():
    tag = row.Tag
    if tag not in keyword_cache:
        keyword_df = pd.read_excel(path, sheet_name=str(tag))
        # One keyword list per criterion column, in `cols` order
        keyword_cache[tag] = [list(keyword_df[col]) for col in cols]
    keyword_dict = keyword_cache[tag]
    # row.Rlist holds the clause lists of R1, R2 and R3 in that order.
    for i, reviews in enumerate(row.Rlist):
        scores[i].append(ReviewSentiments(reviews, keyword_dict))

In [None]:
# Add one column S<i><j> per review i (1..3) and criterion j (1..4).
for i in range(1,4):
    score_ls = scores[i-1]
    for j in range(1,5):
        # ReviewSentiments returns the int 0 for a review with no clauses;
        # record that case as -1, otherwise take the j-th criterion score.
        sls = [-1 if isinstance(score, int) else score[j-1] for score in score_ls]
        # Build the Series on new_reviews_df's own index.  A default
        # RangeIndex would be aligned by label against the post-dropna index
        # and put the scores on the wrong rows.
        new_reviews_df["S%s%s"%(i,j)] = pd.Series(sls, index=new_reviews_df.index)

In [None]:
# new_reviews_df.head(10)
# new_reviews_df.to_excel("result.xlsx")