In [1]:
from __future__ import print_function

In [2]:
# Data preprocess task
content = open('uplancebot-messages.txt').read()
print(len(content), 'bytes')

82775 bytes


In [3]:
def parse_message(text):
    return text.split('\n')

def is_offer(message):
    is_3_lines = len(message) == 3
    is_bot = message[0].startswith('Uplancebot')
    return is_3_lines and is_bot

In [4]:
messages = map(parse_message, content.split('\n\n'))
len(messages)

639

In [5]:
offers = filter(is_offer, messages)
len(offers)

632

In [6]:
length = 3
start = len(offers) - length
for i in range(length):
    map(print, offers[start + i])
    print()

Uplancebot, [02.08.17 00:47]
Freelance Writer/Marketing Communications Professional Needed | Hourly
http://www.upwork.com/jobs/~014494d5222fcdb460

Uplancebot, [02.08.17 01:47]
Consultation/Design is needed: Website's Wireframe + Colour Palette + UI + Logo | $50
http://www.upwork.com/jobs/~01b0ded1bbf2a593c7

Uplancebot, [02.08.17 01:47]
Need Someone To Write Product Description for Long Time. | $50
http://www.upwork.com/jobs/~01c5f186e3f7902784



In [7]:
def parse_time(head):
    i = head.index('[')
    j = head.index(']')
    return head[i+1:j]

def parse_title(title):
    offer = title.split('|')
    title = offer[0]
    junk  = list('+-?"*$&!/().:;,\'')
    for c in junk:
        title = title.replace(c, ' ')
    words = map(str.lower, title.split())
    attr  = offer[1].strip()
    return (words, attr)

In [8]:
import random
offer = random.choice(offers)
parse_time(offer[0])

'31.07.17 01:18'

In [9]:
for i in range(3):
    offer = random.choice(offers)
    print(parse_title(offer[1]))

(['minor', 'tweaks', 'to', 'wordpress', 'theme'], '$12')
(['need', 'wordpress', 'plugin', 'modified'], '$20')
(['create', 'adv', 'video', 'using', 'aftereffects', 'simple', 'text', 'images', 'animations', 'sync', 'to', 'voiceover'], '$400')


In [10]:
cols = 'date title url attr words'.split()

In [11]:
def process_offer(offer):
    head, title, url = offer
    date = parse_time(head)
    words, attr = parse_title(title)
    
    # make dict suitable for indexing and csv exporting
    row = dict(zip(cols, [date, title, url, attr, words]))
    return row

In [12]:
bag = map(process_offer, offers)
len(bag)

632

In [13]:
bag[0]

{'attr': '$25',
 'date': '31.07.17 01:17',
 'title': 'Transfer Wordpress site to new Hosting on Go Daddy | $25',
 'url': 'http://www.upwork.com/jobs/~0128ee47f8bc1db79b',
 'words': ['transfer',
  'wordpress',
  'site',
  'to',
  'new',
  'hosting',
  'on',
  'go',
  'daddy']}

In [14]:
# csv exporting for future work
import pandas
table = pandas.DataFrame(bag, columns='date words attr url'.split())
table.words = table.words.str.join(' ')
table.to_csv('uplancebot-messages.csv')

In [15]:
table[:10]

Unnamed: 0,date,words,attr,url
0,31.07.17 01:17,transfer wordpress site to new hosting on go d...,$25,http://www.upwork.com/jobs/~0128ee47f8bc1db79b
1,31.07.17 01:17,i need someone to create and design a land inv...,Hourly,http://www.upwork.com/jobs/~010073327960d7d720
2,31.07.17 01:17,need to have my website designed and set up,$75,http://www.upwork.com/jobs/~0183a27f9be5f1f595
3,31.07.17 01:17,need ecommerce site developer,$100,http://www.upwork.com/jobs/~011ee2079f1631d7d2
4,31.07.17 01:17,moving videos from wistia to thinkific and org...,Hourly,http://www.upwork.com/jobs/~0192849f8f0d0d9be1
5,31.07.17 01:17,web application wordpress,$50,http://www.upwork.com/jobs/~01edf35b2d70de29ea
6,31.07.17 01:17,ui design for business directory site,$250,http://www.upwork.com/jobs/~013b08df3bb52ea1ad
7,31.07.17 01:17,an hardworking product manager needed,Hourly,http://www.upwork.com/jobs/~014c418d4f877ba9e4
8,31.07.17 01:17,improve google analytics performance stats,$100,http://www.upwork.com/jobs/~01b486ce1abfd53c8a
9,31.07.17 01:17,are you the customer success wizard we are loo...,Hourly,http://www.upwork.com/jobs/~0161390acfbb0381d1


In [16]:
# Indexing
index = dict()
for offer in bag:
    for word in offer['words']:
        if word not in index:
            index[word] = list()
        index[word].append(offer)

# Using index for profit
def docs(word):
    if word not in index:
        print('Not found')
        return
    for doc in index[word]:
        print(doc['date'], doc['title'])
        print(doc['url'])
        print()

len(index)

1120

In [17]:
# Documents per word frequency
freq = dict()
for word in index:
    freq[word] = len(index[word])

top = freq.items()

def view_top(n):
    for i in range(n):
        word, freq = top[i]
        print('%4d' % freq, word)

In [18]:
# Top 10 words
numkey = lambda t: t[1]
top.sort(key=numkey, reverse=True)
view_top(10)

 228 wordpress
 157 website
 132 for
 117 and
 111 to
  99 a
  82 need
  65 site
  63 developer
  60 with


In [19]:
# Bottom
#top.sort(key=numkey)
top.reverse()
view_top(20)

   1 convert
   1 status
   1 scroll
   1 daily
   1 php7
   1 blogs
   1 faster
   1 land
   1 designers
   1 xamarin
   1 exposure
   1 structure
   1 sql
   1 intent
   1 est
   1 breadcrumb
   1 gamified
   1 robots
   1 class
   1 cosmetics


In [20]:
docs('optimization')

31.07.17 01:17 German Website developer and on-page optimization | $300
http://www.upwork.com/jobs/~01efadfedf55205b51

31.07.17 01:18 Adsense expert needed for wordpress optimization | $50
http://www.upwork.com/jobs/~01474650f024f2e049

31.07.17 01:19 Professional photography site built with Kinetika theme in WordPress needs optimization | $200
http://www.upwork.com/jobs/~01293851d7c7f9b026

31.07.17 18:09 SEO Optimization for Website | Just Need Pros | $282
http://www.upwork.com/jobs/~01ac3d6c2182269a62

31.07.17 21:19 Site speed optimization for my Shopify website | Hourly
http://www.upwork.com/jobs/~0175722a7ca129a548

31.07.17 22:19 Wordpress Speed Optimization Job | $10
http://www.upwork.com/jobs/~018d7998b6511a409f

31.07.17 22:19 On-Page Optimization Experts | Hourly
http://www.upwork.com/jobs/~01bdbf41fed97b9316

01.08.17 00:25 Wordpress Website Speed Optimization | $75
http://www.upwork.com/jobs/~01267275db6d526f23

01.08.17 03:30 NEED SOMEONE WHO KNOWS WORDPRESS/ OPTIMIZATIO

In [21]:
docs('audit')

31.07.17 01:18 Website Code Audit | $150
http://www.upwork.com/jobs/~01e4d2de117a871d63

01.08.17 23:47 Wordpress / WooCommerce Website Speed Audit and Optimization | Hourly
http://www.upwork.com/jobs/~01dd08ce80b96bcd61

