In [2]:
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import pymorphy2
import string
import os

In [3]:
# Text preprocessing
# Shared tokenizer instance; preproc_rpd uses it to split the lemmatised text.
tokenizer = ToktokTokenizer()
# Russian stop-word list from NLTK (needs the NLTK 'stopwords' corpus to be
# available locally); preproc_rpd drops tokens found in it.
stopword_list = nltk.corpus.stopwords.words('russian')

def preproc_rpd(text):
    """Normalise the raw text of one RPD section for vectorisation.

    Parameters
    ----------
    text : mapping or pandas.Series
        Container whose ``'text'`` entry holds the raw section text.

    Returns
    -------
    str
        Space-joined lemmas with ASCII punctuation (``string.punctuation``)
        and Russian stop words removed.
    """
    # Imported locally because the notebook-level ``import re`` only appears
    # in a cell that comes after this definition.
    import re

    # Constructing a MorphAnalyzer loads its dictionaries, so a single
    # instance is created on first use and reused by all later calls.
    morph = getattr(preproc_rpd, '_morph', None)
    if morph is None:
        morph = preproc_rpd._morph = pymorphy2.MorphAnalyzer()

    raw = text['text']
    # NOTE(review): newlines are deleted, not replaced by a space, so words
    # separated only by a line break get glued together. Kept unchanged in
    # case the reference corpus was preprocessed the same way - confirm.
    raw = re.sub(r'\n', '', raw)
    raw = raw.translate(str.maketrans('', '', string.punctuation))
    lemmas = ' '.join(morph.parse(word)[0].normal_form for word in raw.split())

    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(stopword_list)
    tokens = (token.strip() for token in tokenizer.tokenize(lemmas))
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

In [None]:
import my_parser
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re


def null_cols(rpd, rpd_prep):
    """Report on the section structure of an RPD and on its empty sections.

    Parameters
    ----------
    rpd : pandas.DataFrame
        One column per section; the ``'text'`` row holds the raw section text.
    rpd_prep : pandas.Series
        Preprocessed text of every section, indexed by section name.

    Returns
    -------
    str
        Report fragment: the structure verdict, the number of empty sections
        and, for each of them, its name and raw text.
    """
    all_cols = ['Титульная страница', 'Раздел 1.', '1.1.', '1.2.', '1.3.', '1.4.',
       'Раздел 2.', '2.1.', '2.1.1', '2.2.', 'Раздел 3.', '3.1.', '3.1.1',
       '3.1.2', '3.1.3', '3.1.4', '3.1.5', '3.2.', '3.2.1', '3.2.2', '3.3.',
       '3.3.1', '3.3.2', '3.3.3', '3.3.4', '3.3.5', '3.4.', '3.4.1', '3.4.2',
       '3.4.3', 'Раздел 4.']
    # Sections that are allowed to have no text after preprocessing.
    acc_null_cols = {'Раздел 1.', 'Раздел 2.', '2.1.', '2.1.1', 'Раздел 3.',
                     '3.1.', '3.2.', '3.3.', '3.4.'}

    # The structure is accepted when the document has no section outside the
    # template; sections missing from the document are not checked here.
    structure = 'Верна \n' if set(rpd.columns) <= set(all_cols) else 'Не верна \n'
    lines = [f'Структура: {structure}']

    # Filter in index order (not via set difference) so the report lists the
    # empty sections in document order and is identical from run to run.
    empty_sections = [
        section for section in rpd_prep[rpd_prep == ''].index.tolist()
        if section not in acc_null_cols
    ]

    lines.append(
        f'Количество незаполненных или мало заполненных разделов: {len(empty_sections)} \n'
    )
    if empty_sections:
        lines.append(f'Недостаток информации в разделах: {empty_sections} \n')
    for section in empty_sections:
        lines.append(f'Раздел: {section} \n')
        lines.append(f'Текст данной рпд: {rpd[section]["text"]} \n')

    return ''.join(lines)

def year_num(text):
    """Extract the year and the six-digit number from title-page text.

    Parameters
    ----------
    text : str
        Text to search.

    Returns
    -------
    tuple
        ``(year, number)``. ``year`` is the first ``20xx`` match and
        ``number`` the first run of six digits, both as strings; either one
        is ``np.nan`` when the text has no such match.
    """
    # NOTE(review): the year pattern is not anchored, so it can also match
    # inside a longer digit run (e.g. inside the six-digit number) - confirm
    # that the title format rules this out.
    year_match = re.search(r'20[0-9]{2}', text)
    number_match = re.search(r'[0-9]{6}', text)

    year = year_match.group(0) if year_match else np.nan
    # Same fallback as for the year, so a title without a six-digit number
    # no longer aborts the report with an AttributeError.
    number = number_match.group(0) if number_match else np.nan
    return year, number


def search_similarities(rpd, rpd_prep, threshold=0.9):
    """Find, for every section, the most similar section text in the reference corpus.

    Parameters
    ----------
    rpd : pandas.DataFrame
        One column per section; the ``'text'`` row holds the raw section text.
    rpd_prep : pandas.Series
        Preprocessed text of every section, indexed by section name.
    threshold : float, optional
        A match is reported only when its cosine similarity is strictly
        greater than this value. Defaults to 0.9.

    Returns
    -------
    str
        Report fragment listing the sections whose best match exceeds
        ``threshold``, with the similarity, the section text and the
        matching reference text.
    """
    model = Doc2Vec.load('pm_fmkn_trained upd dm0')
    mm = pd.read_csv('math-mech upd.csv').fillna('')
    cols = rpd.columns

    closest = dict()
    for c in cols:
        # The query vector does not depend on the reference row, so it is
        # inferred once per section rather than once per (section, row) pair.
        query_vec = model.infer_vector(rpd_prep[c].split()).reshape(1, -1)
        best_sim = -1000
        # read_csv yields a default 0..n-1 index, so the enumeration position
        # is the same row label the lookup below relies on.
        for i, ref_text in enumerate(mm[c]):
            ref_vec = model.infer_vector(ref_text.split()).reshape(1, -1)
            # cosine_similarity returns a 1x1 array; keep only the scalar.
            sim = cosine_similarity(query_vec, ref_vec)[0][0]
            if sim > best_sim:
                best_sim = sim
                closest[c] = (i, sim)

    mm_or = pd.read_csv('math-mech.csv')
    lines = ['Похожие тексты: \n']
    for c in cols:
        # No entry exists when the corpus is empty or no similarity was comparable.
        if c not in closest:
            continue
        row, sim = closest[c]
        if sim > threshold:
            lines.append(f'Номер раздела: {c}, Косинусное расстояние: {sim} \n')
            lines.append(f'Текст данной рпд: {rpd[c]["text"]} \n')
            # NOTE(review): the +1 offset presumes row i of 'math-mech upd.csv'
            # corresponds to row i+1 of 'math-mech.csv' - confirm against the
            # way the preprocessed file was produced.
            lines.append(f'Похожий текст: {mm_or[c][row+1]} \n')

    return ''.join(lines)


def compliance(rpd, name):
    """Measure how close the text of section '2.2.' is to the discipline name.

    Parameters
    ----------
    rpd : mapping or pandas.Series
        Section texts keyed by section name; the ``'2.2.'`` entry must be a
        string (it is split on whitespace).
    name : str
        Discipline name; every word is lemmatised before vectorisation.

    Returns
    -------
    numpy.ndarray
        1x1 array holding the cosine similarity of the two inferred vectors.
    """
    morph = pymorphy2.MorphAnalyzer()
    lemmatised_name = ' '.join(morph.parse(word)[0].normal_form for word in name.split())
    model = Doc2Vec.load('pm_fmkn_trained upd dm0')

    v_d = model.infer_vector(rpd['2.2.'].split()).reshape(1, -1)
    v_n = model.infer_vector(lemmatised_name.split()).reshape(1, -1)

    # The raw similarity is returned as is; no threshold is applied here.
    return cosine_similarity(v_d, v_n)

def write(report, number):
    """Save a report as ``reports/<number>.txt``.

    Parameters
    ----------
    report : str
        Report text to store.
    number : str
        Identifier used as the file name (without extension).
    """
    # Create the target directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs('reports', exist_ok=True)
    # The report is Russian text; an explicit encoding keeps the file
    # readable regardless of the platform's default encoding.
    with open(f"reports/{number}.txt", "w", encoding="utf-8") as file:
        file.write(report)
                      

def report(rpd_file):
    """Build the full text report for one RPD file.

    The file is parsed with ``my_parser.feed_content``, the discipline name
    is taken from the title page, every section is preprocessed, and the
    fragments produced by ``null_cols``, ``compliance`` and
    ``search_similarities`` are concatenated in that order.

    Parameters
    ----------
    rpd_file
        Input passed straight to ``my_parser.feed_content``.

    Returns
    -------
    str
        The assembled report.
    """
    sections = pd.DataFrame(my_parser.feed_content(rpd_file)).drop(index='title')

    # The captured group is the text between the line ending in 'Ы' and the
    # last newline that precedes any lowercase Latin letter.
    title_match = re.search(r"Ы\n([^a-z]*)\n", sections['Титульная страница']['text'])
    name = title_match.group(1)

    prepared = sections.apply(preproc_rpd)

    # Year and number are extracted but not used further in this function;
    # the number is the file name expected by write() should the report be saved.
    _year, _number = year_num(prepared['Титульная страница'])

    parts = [
        f'Рабочая программа по дисциплине:  {name} \n',
        null_cols(sections, prepared),
        f'Рабочая программа соответсвует дисциплине: {compliance(prepared, name)} \n',
        search_similarities(sections, prepared),
    ]
    return ''.join(parts)


In [4]:
# Add the RPD number and year to the math-mech dataset

import pandas as pd
import re
import numpy as np

# Source dataset; its 'Титульная страница' column is parsed below for the
# year and the number of each document.
mm = pd.read_csv('math-mech.csv')

def year(text):
    """Return the first ``20xx`` year found in ``text``.

    Parameters
    ----------
    text : str or missing value
        Title-page text; missing cells (NaN, None) are accepted.

    Returns
    -------
    str or float
        The matched year as a string, or ``np.nan`` when ``text`` is not a
        string or contains no year.
    """
    # A type check is used rather than ``is not np.nan``: a missing value is
    # not guaranteed to be the np.nan object itself, and any other NaN or
    # None would make re.search raise TypeError.
    if not isinstance(text, str):
        return np.nan
    match = re.search(r'20[0-9]{2}', text)
    return match.group(0) if match else np.nan

def number(text):
    """Return the first run of six digits found in ``text``.

    Parameters
    ----------
    text : str or missing value
        Title-page text; missing cells (NaN, None) are accepted.

    Returns
    -------
    str or float
        The six-digit number as a string, or ``np.nan`` when ``text`` is not
        a string or contains no such number.
    """
    # A type check is used rather than ``is not np.nan``: a missing value is
    # not guaranteed to be the np.nan object itself.
    if not isinstance(text, str):
        return np.nan
    match = re.search(r'[0-9]{6}', text)
    # A title without a six-digit number yields NaN instead of raising
    # AttributeError on a failed match.
    return match.group(0) if match else np.nan
    
# Parse the year and the number of every document from its title page.
mm['year'] = mm['Титульная страница'].apply(year)
mm['number'] = mm['Титульная страница'].apply(number)

mm_upd = pd.read_csv('math-mech upd.csv')

# DataFrame.join returns a new frame and leaves the caller untouched, so the
# result must be kept; otherwise the saved file has no year/number columns.
# NOTE(review): rows are matched by index label, which assumes both CSV files
# list the same documents in the same order - confirm.
mm_upd = mm_upd.join(mm[['year', 'number']])
mm_upd.to_csv('math-mech upd num.csv', index = False)