In [235]:
import pandas as pd
import numpy as np
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [236]:
data = pd.read_csv('gofundme.csv')

In [237]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 14 columns):
web-scraper-order        408 non-null object
web-scraper-start-url    408 non-null object
url                      408 non-null object
url-href                 408 non-null object
post_name                408 non-null object
story                    404 non-null object
amount_donated           408 non-null object
amount_goal              408 non-null object
donatedby_pplcount       405 non-null object
likes                    405 non-null object
shares                   401 non-null object
location                 405 non-null object
post_created_on          405 non-null object
image-src                348 non-null object
dtypes: object(14)
memory usage: 44.7+ KB


In [238]:
def set_missing(df):
    known = df[df.story.notnull()]
    unknown = df[df.story.isnull()]
    var = known.columns.tolist()
    X = known.loc[:, var ].drop('story',axis = 1)
    y = known['story']
    # replace predicted values
    predicted = ' '
    df.loc[(df.story.isnull()), 'story'] = predicted
    return df.info()

In [239]:
set_missing(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408 entries, 0 to 407
Data columns (total 14 columns):
web-scraper-order        408 non-null object
web-scraper-start-url    408 non-null object
url                      408 non-null object
url-href                 408 non-null object
post_name                408 non-null object
story                    408 non-null object
amount_donated           408 non-null object
amount_goal              408 non-null object
donatedby_pplcount       405 non-null object
likes                    405 non-null object
shares                   401 non-null object
location                 405 non-null object
post_created_on          405 non-null object
image-src                348 non-null object
dtypes: object(14)
memory usage: 44.7+ KB


In [2]:
def count_words(dataframe,column):
    captions = []
    for i in range(dataframe.shape[0]):
        captions.append(pd.DataFrame(dataframe.iloc[:][column]).loc[i][0].lower())
    wordcount = Counter(pos_tag(word_tokenize(''.join(captions))))
    word_list = sorted(list(wordcount.items()), key = lambda w: -w[1])
    stoplist = nltk.corpus.stopwords.words('english')
    word_list = [word_list[i] for i in range(len(word_list)) 
                 if len(word_list[i][0][0]) > 2 and word_list[i][0][0] not in stoplist 
                 and word_list[i][0][1] in ['NN', 'NNP', 'NNS', 'JJ']]
    return word_list

In [1]:
count_words(data, 'post_name')

NameError: name 'count_words' is not defined

# Sentiment

In [242]:
# from sklearn.metrics.pairwise import cosine_similarity
# from nltk.tokenize import word_tokenize
# from math import log
# import numpy as np
# import csv
# import re
# from collections import namedtuple


from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def get_story_sentiment_score(stories):
    sentiments = []
    analyser = SentimentIntensityAnalyzer()
    for story in stories:
        story_sentiment = analyser.polarity_scores(story)['compound']
        sentiments.append(story_sentiment)
    return sentiments

data['sentiment'] = get_story_sentiment_score(data['story'])

# Length of description

In [244]:
data['story_length'] = [len(story) for story in data['story']]

# Money transformation

In [245]:
data['amount_donated'] = [int(amount[1:].replace(',','')) for amount in data['amount_donated']]

In [262]:
goal = []

for gathered in data['amount_goal']:
    if gathered == 'raised':
        goal.append(data['amount_donated'])
    else:
        a = gathered[4:-5].replace(',','')
        if 'M' in a:
            b = int(a[:-1].replace('.',''))*100000
            goal.append(b)
        else:
            goal.append(int(a))


data['amount_goal'] = goal

In [270]:
data['percentage'] = data['amount_donated'] / data['amount_goal']

# END

In [231]:
import re
import matplotlib.pyplot as plt
from google.oauth2 import service_account
from google.cloud import vision
import csv
import urllib.request
import time

DAYS_GRAPH_MAX = 730
MAX_POSTS = 10
DONATIONS_URL = 'https://www.gofundme.com/mvc.php?route=donate/pagingDonationsFoundation&url={}&idx={}&type=recent'

class Donation:
    def __init__(self, time_raised, amount):
        self.time_raised = time_raised
        self.amount = amount

class Fundraise:
    def __init__(self, id, start_url, url, post_name, story, amount_donated, amount_goal, amount_donators, likes, shares, location, created_date, post_image):
        self.id = id
        self.start_url = start_url
        self.url = url
        self.post_name = post_name
        self.story = story
        self.amount_donated = amount_donated
        self.amount_goal = amount_goal
        self.amount_donators = amount_donators
        self.likes = likes
        self.shares = shares
        self.location = location
        self.created_date = created_date
        self.post_image = post_image
        self.image_labels = []

donations = []


def import_scraped_posts(file_path):
    post_counter = 0
    fundraises = []
    with open(file_path,'r', encoding="utf8") as csv_file:
        csv_reader = csv.reader(csv_file, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)
        next(csv_reader)
        for row in csv_reader:
            if post_counter > MAX_POSTS:
                break
            fundraise = Fundraise(row[0],row[1],row[3],row[4],row[5],re.sub('[^0-9]','', row[6]),re.sub('[^0-9]','', row[7]),row[8],row[9],row[10],row[11],row[12],row[13])
            fundraises.append(fundraise)
            post_counter += 1
    return fundraises

def normalize_time_raised(time_raised):
    time_raised_str = time_raised
    if 'hours' in time_raised_str:
        time_raised = 0
    p = re.compile(r"\d+")
    match = p.match(time_raised)
    if match:
        time_raised = int(match[0])
    if 'month' in time_raised_str:
        time_raised = time_raised * 30
    return time_raised

def analyze_donation_times(donations):
    days_graph = [0] * DAYS_GRAPH_MAX
    for i in range(len(donations)):
        time_raised = donations[i].time_raised
        time_raised = normalize_time_raised(time_raised)
        donations[i].time_raised = time_raised

    for donation in donations:
        if donation.time_raised < DAYS_GRAPH_MAX:
            days_graph[donation.time_raised] += 1
    return days_graph

def plot_donations_graph(days_graph):
    plt.plot(days_graph)
    plt.ylabel('donators')
    plt.xlabel('days')
    plt.show()

def get_posts_attrs(fundraises):
    credentials = service_account.Credentials.from_service_account_file('UGCA-f6b4b2de52f8.json')
    client = vision.ImageAnnotatorClient(credentials=credentials)
    image = vision.types.Image()
    for fundraise in fundraises:
        if fundraise.post_image == '':
            continue
        else:
            image.source.image_uri = fundraise.post_image
            response = client.label_detection(image=image)
            fundraise.image_labels = [label.description for label in response.label_annotations]

def extract_donations(html):
    donation_times = []
    start_index = 0
    end_index = 0
    while True:
        end_index = html.find('ago',end_index+1)
        if end_index == -1:
            break
        start_index = end_index - 10
        if start_index < 0:
            start_index = 0
        p = re.compile(r"[0-9]+? [a-z]{3,8}")
        match = p.match(html[start_index:end_index])
        if not match:
            continue
        donation_time = normalize_time_raised(match[1])
        donation_times.append(donation_time)

    #donation_time_strs = re.findall(r"([0-9]+? [a-z]{3,8}) ago", html)
    #donation_times = [normalize_time_raised(donation_time_str) for donation_time_str in donation_time_strs]
    return donation_times

def get_fundraise_donations(fundraise,project_url_name):
    donation_times = []
    index = 0

    response = urllib.request.urlopen(DONATIONS_URL.format(project_url_name, index))
    html = str(response.read())
    while len(html)>800:
        start_time = time.time()
        response = urllib.request.urlopen(DONATIONS_URL.format(project_url_name,index))
        print("--- %s seconds ---" % (time.time() - start_time))
        html = str(response.read())
        donation_times.extend(extract_donations(html))
        index += 10
    return donation_times

def get_donation_times(fundraises):
    donation_times = []
    i=0
    fundraises.sort(key= lambda fundraise: fundraise.amount_donated, reverse=True)
    while i in range (1) and i in range(len(fundraises)):
        p = re.compile(r".*?\/([^\/]*)$")
        match = p.match(fundraises[i].url)
        if match:
            project_url_name = match[1]
        else:
            i += 1
            continue
        donation_times.extend(get_fundraise_donations(fundraises[i],project_url_name))
        i += 1

    return donation_times


fundraises = import_scraped_posts('gofundme.csv')
#donation_times = get_donation_times(fundraises)
#print(donation_times)
get_posts_attrs(fundraises)

print(fundraises[0].image_labels)

days_graph = analyze_donation_times(donations)

[]


In [271]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/thomas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [272]:
post_labels = [fundraises[i].image_labels for i in range(len(fundraises))]

In [273]:
dict_labels = {}
for i in range(len(post_labels)):
    for j in range(len(post_labels[i])):
        if post_labels[i][j] in dict_labels:
            a = dict_labels[post_labels[i][j]] + 1
            dict_labels[post_labels[i][j]] = a
        else:
            dict_labels[post_labels[i][j]] = 1

In [284]:
import sklearn.lda
from sklearn.feature_extraction.text import CountVectorizer

ImportError: No module named 'sklearn.lda'

In [283]:
!conda install lda

Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - lda

Current channels:

  - https://conda.anaconda.org/conda-forge/osx-64
  - https://conda.anaconda.org/conda-forge/noarch
  - https://repo.anaconda.com/pkgs/main/osx-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/free/osx-64
  - https://repo.anaconda.com/pkgs/free/noarch
  - https://repo.anaconda.com/pkgs/r/osx-64
  - https://repo.anaconda.com/pkgs/r/noarch
  - https://repo.anaconda.com/pkgs/pro/osx-64
  - https://repo.anaconda.com/pkgs/pro/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




In [55]:
# names = pd.read_csv('names.csv')

In [111]:
# set_of_names = set(names['name'])
# months = ['January', 'February', 'March', 'April', 'May', 'June', 
#           'July', 'August', 'September', 'October', 'November', 'December']
# family_related = ['aunt', 'brother', 'cousin', 'daughter', 'father', 'grandchild',
#                   'granddaughter', 'grandson', 'grandfather', 'granddmother', 
#                   'great-grandchild', 'husband', 'ex-husband', 'in-laws', 'son-in-law',
#                   'daughter-in-law', 'mother', 'niece', 'nephew', 'parents', 'sister',
#                   'son', 'stepfather', 'stepmother', 'stepdaughter', 'stepson', 'twin',
#                   'uncle', 'widow', 'widower', 'wife', 'ex-wife']
# possesive_words = ['he', 'she', 'they', 'my', 'mine', 'our', 'ours', 'his', 'her']

In [114]:
# personal = []

# for story in data['story']:
#     count_name = 0
#     count_date = 0
#     count_fami = 0
#     count_poss = 0
#     for name in set_of_names:
#         if name in story:
#             count_name += 1
#     for month in months:
#         if month in story:
#             count_date +=1
#     for f in family_related:
#         if f in story:
#             count_fami +=1    
#     for p in possesive_words:
#         if p in story:
#             count_poss +=1  
#     if count_name > 0 and count_date > 0 and count_fami > 0 and count_poss > 0:
#         personal.append(1)

#     else:
#         personal.append(0)