In [3]:
# from credentials import token
from requests.exceptions import HTTPError, Timeout
from datetime import date, datetime
from pymystem3 import Mystem
import matplotlib.pyplot as plt
import collections
import requests
import json
import os
import re

In [None]:
def get_age(birthday):
    """Return the number of full years elapsed since ``birthday``.

    ``birthday`` may be any object exposing ``year``, ``month`` and ``day``
    attributes (``date`` and ``datetime`` both qualify). The reference
    point is ``date.today()``.
    """
    now = date.today()
    # Until this year's birthday has been reached, the current year does
    # not count as a full one.
    still_ahead = (now.month, now.day) < (birthday.month, birthday.day)
    return now.year - birthday.year - int(still_ahead)


# Deletion table for the punctuation that clean() removes.
_PUNCTUATION = str.maketrans('', '', ',!?.-:;)(—')


def clean(string):
    """Remove punctuation from ``string`` and trim surrounding whitespace.

    Every occurrence of ``, ! ? . - : ; ) (`` and of the em dash is
    deleted; all other characters are kept as they are.

    Args:
        string: text to clean.

    Returns:
        The cleaned text.
    """
    # One translate pass does the work of the former cascade of
    # str.replace calls, each of which rescanned the whole string.
    return string.translate(_PUNCTUATION).strip()


def reformat_csv(status, post_id, poster_id, first_name, last_name,
                 sex, city, bdate, age, education, text):
    """Format the fields of one post or comment as a csv row fragment.

    The result has twelve comma-separated columns: the nine leading
    arguments, the quoted ``education``, the length of the cleaned text
    and the quoted raw ``text``. The caller appends the last column and
    the newline.

    Args:
        status, post_id, poster_id, first_name, last_name, sex, city,
        bdate, age, education: values written through ``%s`` formatting.
        text: message body; ``None`` is treated as an empty text.

    Returns:
        The row fragment as a string.
    """
    if text is None:
        # clean(None) would raise AttributeError; a missing text is
        # written as an empty one instead.
        text = ''
    # The length column measures the text without punctuation, while the
    # text column keeps the message as received.
    length = len(clean(text))
    # NOTE(review): values are not escaped, so a comma or a double quote
    # inside a field shifts the columns of that row.
    return '%s,%s,%s,%s,%s,%s,%s,%s,%s,"%s",%s,"%s"' % (
        status, post_id, poster_id, first_name, last_name,
        sex, city, bdate, age, education, length, text)


def make_csv(line):
    """Append ``line`` to ``stats.csv`` in the working directory.

    When the file does not exist yet it is created and the column header
    is written before the line.
    """
    target = 'stats.csv'
    header = ('status,post_id,owner_id,name,surname,sex,city,'
              'bdate,age,education,length,text,comments_id\n')
    needs_header = not os.path.exists(target)
    with open(target, 'a', encoding='utf-8') as out:
        if needs_header:
            out.write(header)
        out.write(line)

In [None]:
def get_poster_info(poster_id, token):
    """Look up a VK user's profile through the ``users.get`` API method.

    Args:
        poster_id: numeric author id. Negative ids are answered with
            "unknown" without contacting the API (presumably these are
            community ids rather than user accounts).
        token: API access token, sent as ``access_token``.

    Returns:
        A 7-tuple ``(first_name, last_name, sex, city, bdate, age,
        education)``. ``sex`` is ``'female'``, ``'male'`` or
        ``'unspecified'``; ``age`` is an int only when ``bdate`` is a
        full ``D.M.YYYY`` date; ``education`` is the cleaned university
        name. Values the profile does not provide are None. All seven
        elements are None when the id is negative, the request fails or
        the reply holds no user record, so callers can always unpack
        the result.
    """
    unknown = (None,) * 7
    if poster_id < 0:
        return unknown
    params = {'access_token': token, 'v': '5.95', 'user_id': poster_id,
              'fields': 'city,sex,first_name,last_name,bdate,education'}
    # NOTE(review): no timeout is passed to requests.get, so Timeout can
    # hardly occur here and a stalled connection blocks indefinitely.
    try:
        req = requests.get('https://api.vk.com/method/users.get',
                           params=params)
        req.raise_for_status()
    except (HTTPError, Timeout):
        # A tuple is required here: ``except HTTPError or Timeout``
        # evaluates the ``or`` first and only ever catches HTTPError.
        print('Could not get User Info')
        return unknown
    try:
        items = json.loads(req.text)['response'][0]
    except (KeyError, IndexError, ValueError):
        # Error payload, empty user list or a body that is not JSON.
        return unknown

    first_name = items['first_name']  # poster/commenter name
    last_name = items['last_name']  # poster/commenter surname
    sex = {1: 'female', 2: 'male'}.get(items.get('sex'), 'unspecified')
    try:
        city = items['city']['title']
    except KeyError:
        city = None

    bdate = items.get('bdate')
    age = None
    if bdate is not None:
        try:
            birthday = datetime.strptime(bdate, '%d.%m.%Y')
        except ValueError:
            # The year is hidden ('D.M') or the date is malformed.
            pass
        else:
            age = get_age(birthday)

    education = items.get('university_name')
    if education is not None:
        education = clean(education)
    return first_name, last_name, sex, city, bdate, age, education

In [None]:
def get_comment_info(post_id, community_id, token):
    """Download all comments of a wall post via ``wall.getComments``.

    Comments are requested in pages of 100 until an empty page comes
    back. A page that cannot be fetched or parsed is retried; after
    three failures in a row the function stops and returns what it has
    collected so far instead of looping forever.

    Args:
        post_id: id of the post whose comments are read.
        community_id: sent as ``owner_id`` of the wall.
        token: API access token.

    Returns:
        ``(comments, strings)``: the ids of the comments that have an
        author, and for each of them a csv row ending with the comment
        id and a newline.
    """
    comments = []
    strings = []
    offset = 0
    max_failures = 3
    failures = 0
    while failures < max_failures:
        params = {'access_token': token, 'v': '5.95',
                  'owner_id': community_id, 'post_id': post_id,
                  'count': 100}
        if offset:
            params['offset'] = offset
        try:
            req = requests.get('https://api.vk.com/method/wall.getComments',
                               params=params)
            req.raise_for_status()
        except (HTTPError, Timeout):
            # A tuple is required: ``except HTTPError or Timeout`` only
            # ever catches HTTPError.
            items = None
        else:
            try:
                items = json.loads(req.text)['response']['items']
            except (KeyError, ValueError):
                # Error payload without 'response', or a non-JSON body.
                items = None
        if items is None:
            print('Could not mine wall for data')
            failures += 1
            continue
        failures = 0
        if not items:  # an empty page means every comment has been read
            break
        offset += 100
        for item in items:
            if 'from_id' not in item:
                # Comments without an author are skipped.
                continue
            poster_id = item['from_id']
            comment_id = item['id']
            # A missing text is written as an empty one.
            text = (item.get('text') or '').replace('\n', ' ')
            info = get_poster_info(poster_id, token)
            if info is None:
                info = (None,) * 7
            (first_name, last_name, sex, city, bdate,
             age, education) = info
            comments.append(comment_id)
            row = reformat_csv('comment', post_id, poster_id,
                               first_name, last_name, sex,
                               city, bdate, age, education, text)
            strings.append(row + ',' + str(comment_id) + '\n')
    return comments, strings

In [None]:
def get_poster(token=None, domain='proekt_ne_gotov'):
    """Mine a community wall and its comments into ``stats.csv``.

    The wall is read in pages of 100 posts via ``wall.get``. For every
    post that carries a ``signer_id`` one 'post' row is written,
    followed by one row per comment. A page that cannot be fetched or
    parsed is retried; after three failures in a row mining stops.

    Args:
        token: API access token. When omitted it is asked for once on
            the console (it used to be asked for again on every page).
        domain: short address of the wall to mine.
    """
    if token is None:
        token = input('Ваш токен: ')
    offset = 0
    max_failures = 3
    failures = 0
    while failures < max_failures:
        print('Posts mined', ':', offset)
        params = {'access_token': token, 'domain': domain,
                  'v': '5.95', 'offset': offset, 'count': '100'}
        try:
            req = requests.get('https://api.vk.com/method/wall.get',
                               params=params)
            req.raise_for_status()
        except (HTTPError, Timeout):
            # A tuple is required: ``except HTTPError or Timeout`` only
            # ever catches HTTPError.
            items = None
        else:
            try:
                items = json.loads(req.text)['response']['items']
            except (KeyError, ValueError):
                # Error payload (e.g. a rejected token) or non-JSON body.
                items = None
        if items is None:
            print('Could not mine wall for data')
            failures += 1
            continue
        failures = 0
        if not items:
            break  # an empty page means the end of the wall was reached
        for item in items:
            post_id = item['id']
            community_id = item['owner_id']
            # Unsigned posts fall back to the wall owner and are skipped.
            poster_id = item.get('signer_id', community_id)
            if poster_id == community_id:
                continue
            info = get_poster_info(poster_id, token)
            if info is None:
                info = (None,) * 7
            (first_name, last_name, sex, city, bdate,
             age, education) = info
            comments, strings = \
                get_comment_info(post_id, community_id, token)
            text = item['text'].replace('\n', ' ')
            post = reformat_csv('post', post_id, poster_id,
                                first_name, last_name, sex, city,
                                bdate, age, education, text)
            # NOTE(review): the id list is written unquoted, so its
            # commas add columns; the text column therefore stays the
            # last quoted field of the row.
            make_csv(post + ',' + str(comments) + '\n')
            for string in strings:
                make_csv(string)
        offset += 100

In [None]:
def making_unlem():
    """Copy the message texts from ``stats.csv`` to ``non_lemm_text.txt``.

    The text of a row is taken as the content of its last quoted field.
    It is written unchanged, one text per line; rows without a quoted
    text field (the header) and rows with an empty text are skipped.
    """
    with open('stats.csv', 'r', encoding='utf-8') as f:
        rows = f.readlines()
    texts = []
    for row in rows:
        # Greedy match: from the first to the last double quote, i.e.
        # from the education field to the end of the text field.
        quoted = re.search(r'"(.*)"', row)
        if not quoted:
            continue
        field = re.search(r',"(.*)"', quoted.group())
        if not field:
            continue
        # Taking the captured group keeps quotes and ',"' sequences that
        # belong to the message, which replace()/strip('"') used to eat.
        text = field.group(1)
        if text:
            texts.append(text + '\n')
    with open('non_lemm_text.txt', 'w', encoding='utf-8') as f:
        f.writelines(texts)

def making_lem():
    """Lemmatize ``non_lemm_text.txt`` into ``lemm_text.txt`` with Mystem.

    Blank lines are skipped. A progress message is printed after every
    lemmatized line, and the tokens returned by the lemmatizer are
    written out back to back.
    """
    with open('non_lemm_text.txt', 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()
    analyzer = Mystem()
    print('making lemmatized file')
    lemmatized = []
    for raw in raw_lines:
        if raw == '\n':
            continue
        lemmatized.append(analyzer.lemmatize(raw))
        print('lemmatized line: ', len(lemmatized))
    with open('lemm_text.txt', 'w', encoding='utf-8') as target:
        for tokens in lemmatized:
            target.writelines(tokens)

In [None]:
def parameters():
    """Read ``stats.csv`` and build the data series used by the graphs.

    Rows are split on commas; a row is a post when it starts with
    'post' and a comment when it starts with 'comment'. Comments are
    matched to their post through the post id in the second column.
    Any other row (header, blank line) and rows whose length column is
    not an integer are ignored.

    Returns:
        A 5-tuple of lists of pairs:
        ``sex_s`` - (post length, poster sex) per post;
        ``education_s`` - (post length, education column as stored,
        still wrapped in double quotes) per post;
        ``age_s`` - (comment length, commenter age) for comments whose
        age column is an integer;
        ``citie_s`` - (comment length, city column) per comment;
        ``compared_length_s`` - (post length, mean length of that
        post's comments rounded to 3 places) for every post that has
        comments.
    """
    with open('stats.csv', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    sex_s = []
    education_s = []
    citie_s = []
    age_s = []
    posts = []  # (post id, post length) in file order
    comment_lengths = collections.defaultdict(list)  # post id -> lengths
    for line in lines:
        is_post = line.startswith('post')
        if not is_post and not line.startswith('comment'):
            continue
        sline = line.split(',')
        try:
            row_post_id = sline[1]
            length = int(sline[10])
        except (IndexError, ValueError):
            # Malformed row, e.g. a comma inside one of the leading
            # fields shifted the columns.
            continue
        if is_post:
            sex_s.append((length, sline[5]))
            education_s.append((length, sline[9]))
            posts.append((row_post_id, length))
            continue
        # The comment counts towards its post's average and the city
        # series even when the commenter's age is unknown.
        comment_lengths[row_post_id].append(length)
        citie_s.append((length, sline[6]))
        try:
            age_s.append((length, int(sline[8])))
        except ValueError:
            pass
    compared_length_s = []
    for post_id, post_length in posts:
        lengths = comment_lengths.get(post_id)
        if lengths:
            average = sum(lengths) / len(lengths)
            compared_length_s.append((post_length, round(average, 3)))
    return sex_s, education_s, age_s, citie_s, compared_length_s

In [None]:
def compared_graph(compared):
    """Plot mean comment length against post length and save it as PNG.

    Args:
        compared: iterable of ``(post_length, mean_comment_length)``
            pairs; they are drawn in order of ascending post length.

    The image is written to ``post_comment_lengths_compare.png``.
    """
    ordered = sorted(compared, key=lambda pair: pair[0])
    x = [post_length for post_length, _ in ordered]
    y = [comment_length for _, comment_length in ordered]
    # pyplot draws on the current figure; a fresh one keeps graphs made
    # earlier in the same session out of this image.
    plt.figure()
    plt.title('Сравнительная длина поста и его комментариев')
    plt.xlabel('Длины поста')
    plt.ylabel('Длины комментариев')
    plt.plot(x, y, c='#ffa62b', marker='^')
    plt.savefig('post_comment_lengths_compare.png')

In [None]:
def sex_graph(sex):
    """Draw a bar chart of the mean post length per sex and save it.

    Args:
        sex: iterable of ``(post_length, sex)`` pairs. Only the values
            ``'female'`` and ``'male'`` are counted; anything else
            (e.g. ``'unspecified'``) is left out instead of being added
            to the male bar.

    A group without posts is drawn with height 0. The image is written
    to ``sex_vs_post_length_bar.png``.
    """
    female = [length for length, value in sex if value == 'female']
    male = [length for length, value in sex if value == 'male']
    # Guard against an empty group, which used to raise
    # ZeroDivisionError.
    fem = sum(female) / len(female) if female else 0
    mas = sum(male) / len(male) if male else 0
    # pyplot draws on the current figure; a fresh one keeps graphs made
    # earlier in the same session out of this image.
    plt.figure()
    plt.title('Сравнение средней длины постов по полу')
    # The bars are vertical: categories on x, mean length on y (the two
    # labels used to be swapped).
    plt.xlabel('Пол')
    plt.ylabel('Средняя длина поста')
    plt.bar(('male', 'female'), (mas, fem), align='center')
    plt.savefig('sex_vs_post_length_bar.png')

In [None]:
def education_graph(edu):
    """Scatter post length against the poster's education and save it.

    Args:
        edu: iterable of ``(post_length, education)`` pairs, where the
            education value is still wrapped in double quotes. Pairs
            whose value is ``'"None"'`` or ``'""'`` are left out.

    The image is written to ``education_vs_post_length.png``.
    """
    known = [(length, value)
             for length, value in sorted(edu, key=lambda pair: pair[0])
             if value != '"None"' and value != '""']
    x = [length for length, _ in known]
    y = [value for _, value in known]
    # pyplot draws on the current figure; a fresh one keeps graphs made
    # earlier in the same session out of this image.
    plt.figure()
    plt.title('Сравнение длины поста с наличием образования')
    plt.xlabel('Длина поста')
    plt.ylabel('Образование')
    plt.scatter(x, y, c='crimson', marker='D')
    plt.savefig('education_vs_post_length.png')

In [None]:
def age_graph(ag):
    """Scatter comment length against the commenter's age and save it.

    Args:
        ag: iterable of ``(comment_length, age)`` pairs; they are drawn
            in order of ascending comment length.

    The image is written to ``age_vs_comment_length.png``.
    """
    ordered = sorted(ag, key=lambda pair: pair[0])
    x = [length for length, _ in ordered]
    y = [age for _, age in ordered]
    # pyplot draws on the current figure; a fresh one keeps graphs made
    # earlier in the same session out of this image.
    plt.figure()
    plt.title('Сравнение длины комментариев с возрастом')
    plt.xlabel('Длина комментариев')
    plt.ylabel('Возраст')
    plt.scatter(x, y, c='#fd798f', marker='X')
    plt.savefig('age_vs_comment_length.png')

In [None]:
def cities_graph(cit):
    """Scatter comment length against the commenter's city and save it.

    Args:
        cit: iterable of ``(comment_length, city)`` pairs. Pairs with
            the city ``'None'`` or with length 0 are left out.

    The image is written to ``city_vs_comment_length.png``.
    """
    known = [(length, city)
             for length, city in sorted(cit, key=lambda pair: pair[0])
             if city != 'None' and length != 0]
    x = [length for length, _ in known]
    y = [city for _, city in known]
    # pyplot draws on the current figure; a fresh one keeps graphs made
    # earlier in the same session out of this image.
    plt.figure()
    plt.title('Сравнение длины комментариев с городом проживания')
    plt.xlabel('Длина комментариев')
    plt.ylabel('Город')
    plt.scatter(x, y, c='#a87dc2', marker='*')
    plt.savefig('city_vs_comment_length.png')

In [None]:
def quantity_unlem():
    """Plot the 25 most frequent words of the raw texts and save it.

    Reads ``non_lemm_text.txt``, removes punctuation with ``clean``,
    lower-cases the words, drops those listed in ``rus_stopwords.txt``
    and writes the frequency plot to ``quantitative_nonlemmatized.png``.
    """
    with open('rus_stopwords.txt', 'r', encoding='utf-8') as f:
        # A set gives constant-time lookups instead of scanning the
        # whole stop-word list for every word.
        stopwords = {line.strip().lower() for line in f}
    with open('non_lemm_text.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    # Lower-case before filtering so capitalised stop words are removed
    # as well (they used to be compared in their original case).
    words = [word for word in clean(text).lower().split()
             if word not in stopwords]
    top = collections.Counter(words).most_common(25)
    x = [word for word, _ in top]
    y = [count for _, count in top]
    # pyplot draws on the current figure; a fresh one keeps graphs made
    # earlier in the same session out of this image.
    plt.figure()
    plt.title('Частотные и нелемматизированные')
    plt.xlabel('Слова')
    plt.ylabel('Частотность')
    plt.plot(x, y, c='#bffe28')
    plt.savefig('quantitative_nonlemmatized.png')

In [None]:
def quantity_lem():
    """Plot the 25 most frequent words of the lemmatized texts and save it.

    Reads ``lemm_text.txt``, removes punctuation with ``clean``,
    lower-cases the words, drops those listed in ``rus_stopwords.txt``
    and writes the frequency plot to ``quantitative_lemmatized.png``.
    """
    with open('rus_stopwords.txt', 'r', encoding='utf-8') as f:
        # A set gives constant-time lookups instead of scanning the
        # whole stop-word list for every word.
        stopwords = {line.strip().lower() for line in f}
    with open('lemm_text.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    # Lower-case before filtering so capitalised stop words are removed
    # as well (they used to be compared in their original case).
    words = [word for word in clean(text).lower().split()
             if word not in stopwords]
    top = collections.Counter(words).most_common(25)
    x = [word for word, _ in top]
    y = [count for _, count in top]
    # pyplot draws on the current figure; a fresh one keeps graphs made
    # earlier in the same session out of this image.
    plt.figure()
    plt.plot(x, y, c='#4da409')
    plt.title('Частотные и лемматизированные')
    plt.xlabel('Слова')
    plt.ylabel('Частотность')
    plt.savefig('quantitative_lemmatized.png')

In [4]:
def main():
    """Run the whole pipeline: mine the wall, build the texts, draw graphs."""
    # Stage 1: collect the data and derive the two text corpora.
    get_poster()
    making_unlem()
    making_lem()

    # Stage 2: read the statistics back and draw one graph per series.
    (by_sex, by_education, by_age,
     by_city, post_vs_comments) = parameters()
    compared_graph(post_vs_comments)
    sex_graph(by_sex)
    education_graph(by_education)
    age_graph(by_age)
    cities_graph(by_city)

    # Stage 3: word-frequency graphs for the raw and lemmatized texts.
    quantity_unlem()
    quantity_lem()