In [56]:
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import pymorphy2

# Shared NLP resources, created once and reused by the preprocessing helpers below.
tokenizer = ToktokTokenizer()

# Russian stop words from the NLTK corpus: the plain list, plus a set built from it.
stopword_list = stopwords.words('russian')
stop = set(stopword_list)

# Lazily created analyzer shared by every preproc_rpd call.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared pymorphy2 analyzer, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def preproc_rpd(text):
    """Lemmatize, tokenize and stop-word-filter one section of a work programme.

    Parameters
    ----------
    text : mapping or pandas.Series
        Object whose ``'text'`` entry holds the raw section text as a string.

    Returns
    -------
    list of str
        Tokens of the lemmatized text with Russian stop words removed.
    """
    # Building a MorphAnalyzer on every call is wasteful when this function is
    # applied once per column, so one shared instance is reused instead.
    morph = _get_morph_analyzer()

    # Lemmatize whitespace-separated words first, then let Toktok split punctuation.
    lemmatized = ' '.join(
        morph.parse(word)[0].normal_form for word in text['text'].split()
    )
    tokens = (token.strip() for token in tokenizer.tokenize(lemmatized))

    # `stop` is the set form of `stopword_list`: same words, constant-time lookup.
    return [token for token in tokens if token.lower() not in stop]

def preproc_df(text):
    """Lemmatize the string entries of ``text`` in place and return the last cleaned one.

    Parameters
    ----------
    text : indexable collection (e.g. a pandas Series or a list)
        Entries are read and written as ``text[i]`` for ``i`` in
        ``range(len(text))``. Non-string entries (NaN, None, numbers) are skipped.

    Returns
    -------
    str
        The stop-word-filtered text of the last string entry, or ``''`` when
        there is no string entry at all.

    Notes
    -----
    Each string entry is overwritten with its lemmatized (but not
    stop-word-filtered) form; the filtered form is kept only for the last entry.
    NOTE(review): this looks unintended - presumably every cleaned text should be
    stored or returned. Confirm against the callers before changing the contract.
    """
    # One analyzer for the whole pass instead of a new one per entry.
    morph = pymorphy2.MorphAnalyzer()

    # Defined up front so empty or all-missing input no longer raises UnboundLocalError.
    cleaned_text = ''

    for i in range(len(text)):
        value = text[i]
        # An isinstance check also skips NaN objects that are not the np.nan
        # singleton, as well as None, instead of failing on `.split()`.
        if not isinstance(value, str):
            continue

        lemmatized = ' '.join(morph.parse(word)[0].normal_form for word in value.split())
        text[i] = lemmatized

        tokens = (token.strip() for token in tokenizer.tokenize(lemmatized))
        cleaned_text = ' '.join(token for token in tokens if token.lower() not in stop)

    return cleaned_text

In [77]:
import my_parser
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re


def null_cols(rpd):
    """Return the listed section labels whose content in ``rpd`` is empty.

    Parameters
    ----------
    rpd : pandas.Series or dict
        Maps a section label to its content: a token list after preprocessing,
        or a raw string.

    Returns
    -------
    list of str
        Labels from ``acc_null_cols`` whose content has zero length, in the
        order they appear in ``acc_null_cols``.

    Notes
    -----
    The function used to read the global ``df`` instead of its argument, which
    fails on a fresh kernel, and compared against ``''`` only, which never
    matches an empty token list.
    NOTE(review): only labels listed in ``acc_null_cols`` are ever reported;
    confirm that an intersection (rather than a difference) is what the
    "unfilled sections" count is meant to be.
    """
    acc_null_cols = ['Раздел 1.','Раздел 2.','2.1.','Раздел 3.','3.1.','3.2.','3.3.','3.4.']

    def _is_blank(content):
        # Empty string and empty token list both count; values without a
        # length (NaN, numbers) are not treated as empty.
        try:
            return len(content) == 0
        except TypeError:
            return False

    all_null_cols = {section for section, content in rpd.items() if _is_blank(content)}
    return [section for section in acc_null_cols if section in all_null_cols]

def year_num(text):
    """Extract a 20xx year and a six-digit number from ``text``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    tuple
        ``(year, number)``. Each element is the first matching substring, or
        ``np.nan`` when nothing matches.
    """
    # The look-arounds stop the year from matching inside a longer digit run,
    # e.g. the "2014" hidden in the number "002014".
    year_match = re.search(r'(?<![0-9])20[0-9]{2}(?![0-9])', text)
    number_match = re.search(r'[0-9]{6}', text)

    year = year_match.group(0) if year_match else np.nan
    # A missing number used to raise AttributeError on `None.group`.
    number = number_match.group(0) if number_match else np.nan
    return year, number


def search_similarities(rpd):
    """Find, for every corpus column, the corpus row closest to the given programme.

    Parameters
    ----------
    rpd : pandas.Series or mapping
        Maps each column name of ``'math-mech upd.csv'`` to the token list of
        the corresponding section of the programme being checked.

    Returns
    -------
    dict
        ``{column: [row_label, cos_dist]}`` where ``cos_dist`` is the 1x1 array
        ``1 - cosine_similarity`` for the closest row of that column.

    Notes
    -----
    Loads the Doc2Vec model ``'pm_fmkn_trained'`` and reads the corpus CSV on
    every call.
    """
    model = Doc2Vec.load('pm_fmkn_trained')
    mm = pd.read_csv('math-mech upd.csv').fillna('')

    closest = dict()
    for c in mm.columns:
        # Infer the query vector once per column: it does not depend on the
        # corpus row, and this way every row is compared to the same vector.
        v_1 = model.infer_vector(rpd[c]).reshape(1, -1)

        best_dist = None  # replaces the sentinel 1000 and the shadowed builtin `min`
        for i, doc in mm[c].items():
            v_2 = model.infer_vector(doc.split()).reshape(1, -1)
            cos_dist = 1 - cosine_similarity(v_1, v_2)
            # Strict comparison keeps the first row on ties, as before.
            if best_dist is None or cos_dist[0, 0] < best_dist:
                best_dist = cos_dist[0, 0]
                closest[c] = [i, cos_dist]
    return closest


def compliance(rpd, name):
    """Check whether section ``'2.2.'`` of the programme matches the discipline name.

    Parameters
    ----------
    rpd : pandas.Series or mapping
        Preprocessed programme; ``rpd['2.2.']`` is the token list of that section.
    name : str
        Raw discipline name.

    Returns
    -------
    bool
        ``True`` when the cosine similarity between the inferred vectors of the
        section and of the name is positive.

    Notes
    -----
    Loads the Doc2Vec model ``'pm_fmkn_trained upd'`` on every call.
    """
    model = Doc2Vec.load('pm_fmkn_trained upd')

    # The name used to be passed as one raw token. It now goes through the same
    # lemmatize/tokenize/filter pipeline as the section text, falling back to a
    # plain split if filtering leaves nothing.
    name_tokens = preproc_rpd({'text': name}) or name.split()

    v_d = model.infer_vector(rpd['2.2.']).reshape(1, -1)
    v_n = model.infer_vector(name_tokens).reshape(1, -1)

    cos_sim = cosine_similarity(v_d, v_n)

    return bool(cos_sim[0, 0] > 0)
                


def report(rpd_file):
    """Parse a work-programme file, print a short quality report and return its tokens.

    Parameters
    ----------
    rpd_file : str
        Path passed to ``my_parser.feed_content``.

    Returns
    -------
    pandas object
        Result of applying ``preproc_rpd`` to every column of the parsed
        programme (the row labelled ``'title'`` is dropped first).

    Raises
    ------
    ValueError
        If the discipline name cannot be found on the title page.

    Notes
    -----
    Prints the discipline name, the programme number, the number of unfilled
    sections, the compliance flag and the closest corpus rows per section.
    """
    rpd_content = my_parser.feed_content(rpd_file)
    rpd = pd.DataFrame(rpd_content).drop(index='title')

    # The name is the block after a line ending in "Ы" that contains no
    # lowercase Latin letters.
    title_match = re.search(r"Ы\n([^a-z]*)\n", rpd['Титульная страница']['text'])
    if title_match is None:
        # Used to surface as an opaque AttributeError on `None.group`.
        raise ValueError(
            'Could not find the discipline name on the title page of %r' % (rpd_file,)
        )
    name = title_match.group(1)  # same text as group(0)[2:-1]
    print('Рабочая программа по дисциплине:', name)

    rpd = rpd.apply(preproc_rpd)
    title_text = ' '.join(rpd['Титульная страница'])

    year, number = year_num(title_text)

    print('Номер:', number)
    print('Количество незаполненных разделов:', len(null_cols(rpd)))
    print('Рабочая программа соответсвует дисциплине:', compliance(rpd, name))

    print(search_similarities(rpd))
    return rpd


# Run the report on one sample programme; `df` keeps the preprocessed result returned by report().
df = report('рпд-мм/001292_Теория представлений.docx')

Рабочая программа по дисциплине: Теория представлений
Номер: 001292
Количество незаполненных разделов: 0
Рабочая программа соответсвует дисциплине: True
{'Титульная страница': [504, array([[0.18319476]], dtype=float32)], 'Раздел 1.': [828, array([[0.4846182]], dtype=float32)], '1.1.': [20, array([[0.1449337]], dtype=float32)], '1.2.': [741, array([[0.2427991]], dtype=float32)], '1.3.': [7, array([[0.31362754]], dtype=float32)], '1.4.': [378, array([[0.2992177]], dtype=float32)], 'Раздел 2.': [47, array([[0.41195357]], dtype=float32)], '2.1.': [936, array([[0.40028006]], dtype=float32)], '2.1.1': [496, array([[0.45704204]], dtype=float32)], '2.2.': [20, array([[0.14087343]], dtype=float32)], 'Раздел 3.': [608, array([[0.48800492]], dtype=float32)], '3.1.': [106, array([[0.3330769]], dtype=float32)], '3.1.1': [666, array([[0.46085757]], dtype=float32)], '3.1.2': [126, array([[0.19425881]], dtype=float32)], '3.1.3': [866, array([[0.16167611]], dtype=float32)], '3.1.4': [20, array([[0.1338

In [49]:
import pandas as pd
import re
import numpy as np

# Corpus table; its 'Титульная страница' column is parsed for year and number further down.
mm = pd.read_csv('math-mech.csv')

def year(text):
    """Return the first 20xx year found in ``text``, or ``np.nan``.

    Parameters
    ----------
    text : str or missing value
        Text to scan. Anything that is not a string counts as missing.

    Returns
    -------
    str or float
        The four-digit year as a string, or ``np.nan`` when ``text`` is not a
        string or holds no year.
    """
    # isinstance also covers None and NaN objects other than the np.nan
    # singleton; missing input now gives np.nan rather than an implicit None.
    if not isinstance(text, str):
        return np.nan

    # The look-arounds stop a match inside a longer digit run (e.g. "002014").
    match = re.search(r'(?<![0-9])20[0-9]{2}(?![0-9])', text)
    return match.group(0) if match else np.nan

def number(text):
    """Return the first run of six digits found in ``text``, or ``np.nan``.

    Parameters
    ----------
    text : str or missing value
        Text to scan. Anything that is not a string counts as missing.

    Returns
    -------
    str or float
        The six-digit substring, or ``np.nan`` when ``text`` is not a string
        or holds no such run.
    """
    if not isinstance(text, str):
        return np.nan

    # A title page without a six-digit run used to raise AttributeError here.
    match = re.search(r'[0-9]{6}', text)
    return match.group(0) if match else np.nan
    
# Derive the year and the six-digit number from every title page.
mm['year'] = mm['Титульная страница'].apply(year)
mm['number'] = mm['Титульная страница'].apply(number)

mm_upd = pd.read_csv('math-mech upd.csv')

# DataFrame.join returns a new frame; the result used to be discarded, so the
# saved file never received the two columns.
# NOTE(review): the join aligns on the row index, so this assumes both CSV
# files list the programmes in the same order - confirm.
mm_upd = mm_upd.join(mm[['year', 'number']])
mm_upd.to_csv('math-mech upd num.csv', index = False)