Load raw dataset

In [1]:
# Read the pipe-separated raw export; the column named 'Unnamed: 0' becomes the index.
# NOTE(review): the saved output shows pandas echoing this read_csv line as a warning
# (message text not preserved) - possibly a mixed-dtype warning; confirm and pin dtypes.
import pandas as pd

folder = '../data/'
df = pd.read_csv(
    folder + 'raw_data.csv',
    sep="|",
    encoding='utf-8',
    index_col='Unnamed: 0',
)
print(len(df))
df.head()

657041


  df=pd.read_csv(folder+'raw_data.csv', sep="|", encoding ='utf-8', index_col='Unnamed: 0')


Unnamed: 0,date,text,link_title,link_description,history_text,views,likes,comments,reposts
0,2014-11-20,Бузык барин булдирген,,,,,2,0,0
1,2014-11-03,Жандарым,,,,,0,0,0
2,2014-10-28,Алтын куз,,,,,2,0,0
3,2014-07-15,Здравствуйте,,,,,1,0,0
4,2014-07-14,Биз бакиттымыз,,,,,3,0,0


In [2]:
# Number of rows left once exact duplicate rows are removed.
# The de-duplicated frame is only counted here; df itself is not modified.
len(df.drop_duplicates())

554730

In [3]:
# For each content column, count the rows where it is filled in (non-null).
print('number of text posts ', df['text'].notna().sum())
print('number of link-posts ', df['link_title'].notna().sum())
print('number of link posts with descriptions ', df['link_description'].notna().sum())
print('number of reposts ', df['history_text'].notna().sum())

number of text posts  188246
number of link-posts  45650
number of link posts with descriptions  38992
number of reposts  481509


In [4]:
import numpy as np

# Replace missing repost bodies with '' so the column can be compared as plain text.
# Note: this rewrites df['history_text'] in place.
df["history_text"] = df["history_text"].replace(np.nan, '')

# Keep rows whose repost body is neither empty nor the placeholder 'Запись удалена '
# (Russian for "post deleted"). The trailing space in the placeholder is matched
# verbatim - presumably that is how the export stores it; TODO confirm.
df_reposts = df.loc[(df['history_text'] != '') & (df['history_text'] != 'Запись удалена ')]

# The label is written as one double-quoted literal: the previous '...удалена'': ' form
# was two adjacent literals, which dropped the closing quote from the printed text.
print("number of reposts, without 'Запись удалена': ", df_reposts['history_text'].shape[0])

number of reposts, without 'Запись удалена:  355762


In [7]:
# From here on only posts that have their own text are used.
df_text = df[df['text'].notna()]
print("number of text posts: ", len(df_text))

number of text posts:  188246


In [9]:
# Load the text-only dataset. pandas and the data folder are set up again in this
# cell, and `df` is rebound to the frame read here.
# NOTE(review): raw_text_data.csv is presumably df_text written to disk; the cell
# that writes it is not visible in this part of the notebook - confirm.
import pandas as pd

folder = '../data/'
df = pd.read_csv(
    folder + 'raw_text_data.csv',
    sep="|",
    encoding='utf-8',
    index_col='Unnamed: 0',
)
print(len(df))
df.head()

188246


Unnamed: 0,date,text,link_title,link_description
0,2014-11-20,Бузык барин булдирген,,
1,2014-11-03,Жандарым,,
2,2014-10-28,Алтын куз,,
3,2014-07-15,Здравствуйте,,
4,2014-07-14,Биз бакиттымыз,,


# Preprocessing

In [10]:
# Lemmatization setup: imports, stop-word list, tokenizer and morphological analyzer.
import re
from collections import Counter

import nltk
import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')

# Russian stop words with the negation particle "не" removed from the list, so that
# "не" is NOT filtered out when preprocessing() drops stop words.
russian_stopwords = stopwords.words("russian")
rsw = [word for word in russian_stopwords if word != "не"]

# A token is a run of Cyrillic letters and '#'.
# Inside a character class '|' is a literal pipe, not alternation, so the previous
# pattern '[А-Я|Ё|а-я|ё|#]+' also glued '|' characters into tokens. The pipes are
# removed here; 'Ё'/'ё' stay listed explicitly because they lie outside А-Я / а-я.
tokenizer = RegexpTokenizer('[А-ЯЁа-яё#]+')

# One shared analyzer instance (it was previously constructed twice in this cell).
morph = pymorphy2.MorphAnalyzer()

def preprocessing(plain_text):
    """Turn one post into a list of lemmas.

    Steps: lowercase the text, split it with the module-level ``tokenizer``,
    drop tokens of length 1 and tokens containing '#', replace every remaining
    token with the ``normal_form`` of the first parse returned by ``morph``,
    and finally drop lemmas that appear in the stop-word list ``rsw``.

    Parameters
    ----------
    plain_text : str
        Raw post text. Any non-string value (for example the NaN pandas uses
        for an empty cell) is treated as an empty post.

    Returns
    -------
    list of str
        Lemmas in their original order; empty if nothing survives filtering.
    """
    # A missing cell arrives as a float NaN, which has no .lower(); return no tokens
    # instead of raising AttributeError in the middle of a long run.
    if not isinstance(plain_text, str):
        return []

    lemmas = []
    for token in tokenizer.tokenize(plain_text.lower()):
        # Skip one-character tokens and anything carrying a '#'.
        if len(token) <= 1 or '#' in token:
            continue
        lemma = morph.parse(token)[0].normal_form
        # Stop words are checked after lemmatization, i.e. against the normal form.
        if lemma not in rsw:
            lemmas.append(lemma)
    return lemmas

def form_bigrams_list(preprocessed_text):
    """Return the distinct bigrams of a token sequence as 'first_second' strings.

    Parameters
    ----------
    preprocessed_text : iterable of str
        Tokens in text order (e.g. the output of ``preprocessing``).

    Returns
    -------
    list of str
        One entry per distinct pair of adjacent tokens, in order of first
        occurrence. Empty when there are fewer than two tokens.
    """
    tokens = list(preprocessed_text)
    # Pairing each token with its successor yields the consecutive pairs.
    # dict.fromkeys de-duplicates while keeping first-seen order; no counts are
    # needed here, so a Counter is unnecessary.
    distinct_pairs = dict.fromkeys(zip(tokens, tokens[1:]))
    return ['_'.join(pair) for pair in distinct_pairs]
    
    
def get_counted_unigrams(preprocessed_text):
    """Serialize unigram frequencies as a single 'token:count ' string.

    Each distinct token contributes ``token:count`` followed by one space, so a
    non-empty result always ends with a trailing space. Tokens appear in the
    order in which they are first seen. An empty input gives ''.
    """
    frequencies = Counter(preprocessed_text)  # unigram frequency counter
    return ''.join(
        token + ':' + str(count) + ' '
        for token, count in frequencies.items()
    )

def get_counted_bigrams(preprocessed_text):
    """Serialize bigram frequencies as a single 'first_second:count ' string.

    Parameters
    ----------
    preprocessed_text : iterable of str
        Tokens in text order (e.g. the output of ``preprocessing``).

    Returns
    -------
    str
        For every distinct pair of adjacent tokens, ``first_second:count``
        followed by one space, in order of first occurrence. A non-empty result
        therefore ends with a trailing space; fewer than two tokens gives ''.
    """
    tokens = list(preprocessed_text)
    # Bigram frequency counter: each token is paired with its successor.
    pair_counts = Counter(zip(tokens, tokens[1:]))
    # Assemble the line in one join instead of growing a string inside a loop.
    return ''.join(
        '_'.join(pair) + ':' + str(count) + ' '
        for pair, count in pair_counts.items()
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mmilkov2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [232]:
# Create the 'unigrams' and 'bigrams' columns (both are added to df in place).
# The text column is read directly rather than through iterrows(), which builds a
# full Series for every row only to pull one field out of it.
text_list = df['text'].tolist()

# One list of lemmas per post, then the distinct bigrams derived from those lemmas.
preprocessed_list = [preprocessing(text) for text in text_list]
bigrams_list = [form_bigrams_list(tokens) for tokens in preprocessed_list]

df['unigrams'] = preprocessed_list
df['bigrams'] = bigrams_list

In [11]:
# Keep only the rows whose bigram list is non-empty.
df_with_bigrams = df.loc[df['bigrams'].map(len) > 0]