In [22]:
import json
import string
import nltk
import pickle

In [23]:
# Load the scraped project records (a list of dicts; later cells read the
# 'description', 'hackathon_id' and 'winner' keys).
# The encoding is pinned to UTF-8 because the descriptions contain non-ASCII
# text and the platform default encoding is not guaranteed to be UTF-8.
with open('./scrape_data/projects.json', 'r', encoding='utf-8') as f:
    projects = json.load(f)

In [24]:
def transform_blob(blob):
    """Normalise free text into an upper-case, punctuation-free string.

    Steps, in order: drop every character listed in ``string.punctuation``,
    delete newline characters, replace each pair of consecutive spaces with a
    single space (one pass only), and upper-case the result.

    Note: newlines are deleted rather than turned into spaces, so the word
    ending one line and the word starting the next come out joined, and runs
    of three or more spaces still leave repeated spaces behind.

    :param blob: text to normalise (any iterable of one-character strings).
    :return: the normalised string.
    """
    punctuation = frozenset(string.punctuation)
    kept_chars = [ch for ch in blob if ch not in punctuation]
    return ''.join(kept_chars).replace('\n', '').replace('  ', ' ').upper()

In [25]:
# Spot-check the normalisation on a single project description.
transform_blob(projects[4]['description'])

'INSPIRATIONHUMANS SHOULD DO ONLY THOSE THINGS WHICH REALLY INVOLVES USE OF BRAIN AND MUST NOT DO MUNDANE REPETITIVE TASKS CLASSROOM ATTENDANCE IS ONE OF THEM A HUGE TIME WASTER FOR BOTH STUDENTS AND FACULTY WHAT IT DOESFACULTY JUST NEEDS TO TAKE A SNAP OF STUDENTS SITTING IN CLASSROOM AND IT WILL AUTOMATICALLY MARK AND SAVE ATTENDANCE OF STUDENTS IN CLOUD  HOW I BUILT ITAPP USES MS PROJECT OXFORD FACE API FOR IDENTIFYING STUDENTS IN GROUP IMAGE TAKEN BY FACULTYPROFESSOR OF ALL STUDENTS AND SAVES ATTENDANCE RECORD IN CLOUD CHALLENGES I RAN INTOFEELING COLD IN NIGHT REST ALL WAS MANAGEABLE ACCOMPLISHMENTS THAT IM PROUD OFPHOTTENDANCE AS AN ATTENDANCE SOLUTION HAS A DEFINITIVE EDGE OVER OTHER ATTENDANCE TRACKERS FOR EG MOST OF HARDWARE ATTENDANCE TRACKERS EITHER BASED ON RFID CARDS BIOMETRIC THUMB IMPRESSION ETC REQUIRE A HARDWARE PURCHASE INSTALLATION AND MAINTENANCE CHARGES ON THE OTHER PHOTTENDANCE JUST NEEDS TO BE DOWNLOADED AND IS READY TO USE WITH ZERO INVESTMENT OR MAINTENANCE WHA

In [26]:
# Tally how often every normalised token occurs across all project
# descriptions. Tokens come from splitting on a single space, so an
# empty-string token is counted whenever two spaces end up adjacent.
word_count_dict = {}
for project in projects:
    desc = transform_blob(project['description'])
    words = desc.split(' ')
    for word in words:
        word_count_dict[word] = word_count_dict.get(word, 0) + 1

In [27]:
# Display the complete token -> occurrence-count mapping (large output).
word_count_dict

{'': 6047,
 'BETZPIMP': 1,
 'LEVEL': 444,
 'QUICKENS': 1,
 'QUIRKI': 1,
 'FAILEDDONTFAILBAD': 1,
 'WITHANDROIDSQLITEJAVAXML': 1,
 'DARE': 9,
 'JSBIN': 1,
 'მეგობრეთან': 1,
 'INDICATES': 17,
 'RELEVANTWHAT': 2,
 'PASSWORD': 90,
 'WITHFORDQUALCOMMIOSANDROIDMAGNETTHETA': 1,
 'BORNI': 1,
 'ANYONE’S': 1,
 'ANKITK': 1,
 'FIREFOXOS': 1,
 'ВЫПУЩЕННЫХ': 1,
 'EXPEDIENTLY': 1,
 'JAVABACKED': 1,
 'მიიღებთამასთანავე': 1,
 'ПРОИЗВОДСТВА': 1,
 'CẤP': 1,
 'BRIAN': 15,
 'FLURRY': 1,
 'НЕТ': 3,
 'QUESTIONSSTUDIES': 1,
 'SORENSON': 1,
 'PREESCOLAR': 1,
 'IMAGENESWHATS': 1,
 'WATERGUN': 1,
 'SALESFORCEEVERY': 1,
 'KARMAHE': 1,
 'BITEXTCOM': 1,
 'VISITBY': 1,
 'RESCALING': 1,
 'PHONAR': 1,
 'ОБЪЕКТ': 2,
 'RETHINKING': 1,
 'EXTRACTING': 23,
 'DROWSY': 4,
 'SPONGOAPP': 1,
 'PODCASTS': 5,
 'ТОЧКАХ': 1,
 'STOCKED': 1,
 'ALGORITHMROUTE': 1,
 'THROUGHTHIS': 1,
 'INTOALTHOUGH': 3,
 'NECKPIECE': 1,
 'RECIPEINAPP': 1,
 'TRANQUILITY': 1,
 'CONSCIENTIOUS': 1,
 'PLAYLISTCHALLENGES': 1,
 'CAFFEINESOLVING': 1,
 'CONSUME

In [28]:
# Start with an empty token -> index mapping and keep only the tokens that
# occur more than 10 times overall; the last line displays how many survive.
word_order_dict = {}
words_counted = [word for word, count in word_count_dict.items() if count > 10]
len(words_counted)

7036

In [29]:
# Put the vocabulary in sorted order (on a fresh list) and record its size,
# which is the length of every word vector built later.
words_counted = list(words_counted)
words_counted.sort()
num_words = len(words_counted)

In [30]:
# Map each vocabulary token to its position in the sorted vocabulary list.
for i, counted_word in enumerate(words_counted):
    word_order_dict[counted_word] = i

In [31]:
def transform_winner(winner):
    """Encode a truthy/falsy winner flag as the integer label 1 or 0.

    :param winner: any value; only its truthiness is inspected.
    :return: 1 when ``winner`` is truthy, otherwise 0.
    """
    return 1 if winner else 0

In [32]:
# Build one record per project: its hackathon id, a 0/1 winner label and a
# bag-of-words count vector with one slot per vocabulary token.
data_output = []
for project in projects:
    words = transform_blob(project['description']).split(' ')
    word_vec = [0] * num_words
    for word in words:
        # Tokens outside the vocabulary have no slot and are ignored.
        position = word_order_dict.get(word)
        if position is not None:
            word_vec[position] += 1

    record = {
        'hackathon_id': project['hackathon_id'],
        'winner': transform_winner(project['winner']),
        'word_vec': word_vec,
    }
    data_output.append(record)

In [33]:
# Persist the vectorised records as JSON next to the scraped input data.
with open('./scrape_data/data_output.json', 'w') as out_file:
    json.dump(data_output, out_file)

In [34]:
class PassageVectorizer():
    """Bag-of-words vectorizer whose vocabulary comes from scraped projects.

    On construction the project descriptions are read from a JSON file,
    normalised with :meth:`transform_blob`, and every token occurring more
    than ``min_count`` times becomes one slot of the output vector.

    Instance attributes:
        num_words: number of vocabulary tokens (length of each vector).
        word_order_dict: token -> index in the sorted vocabulary.
    """

    # Characters removed by transform_blob; built once instead of per call.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, projects_path='./scrape_data/projects.json', min_count=10):
        """Build the vocabulary from the project descriptions.

        :param projects_path: JSON file holding a list of project dicts, each
            with a 'description' entry.
        :param min_count: a token is kept only when its total number of
            occurrences is strictly greater than this value.
        """
        # Pin UTF-8 so reading does not depend on the platform default encoding.
        with open(projects_path, 'r', encoding='utf-8') as f:
            projects = json.load(f)

        word_count_dict = {}
        for project in projects:
            for word in self.transform_blob(project['description']).split(' '):
                word_count_dict[word] = word_count_dict.get(word, 0) + 1

        words_counted = sorted(
            word for word, count in word_count_dict.items() if count > min_count
        )
        self.num_words = len(words_counted)
        self.word_order_dict = {word: i for i, word in enumerate(words_counted)}

    def transform_blob(self, blob):
        """Normalise free text into an upper-case, punctuation-free string.

        Removes every ``string.punctuation`` character, deletes newlines,
        replaces each pair of consecutive spaces with one space (single pass)
        and upper-cases the result.

        NOTE(review): because newlines are deleted rather than replaced by a
        space, words on either side of a line break are joined, and runs of
        three or more spaces still leave repeated spaces (hence empty tokens
        after ``split(' ')``). This is kept as-is so existing vocabularies and
        vectors stay valid; change it only together with every other copy of
        this normalisation.

        :param blob: text to normalise.
        :return: the normalised string.
        """
        exclude = self._PUNCTUATION
        t_blob = ''.join(ch for ch in blob if ch not in exclude)
        return t_blob.replace('\n', '').replace('  ', ' ').upper()

    def transform_winner(self, winner):
        """Encode a truthy/falsy winner flag as 1 or 0."""
        return 1 if winner else 0

    def vectorize_passage(self, passage):
        """Count vocabulary tokens in ``passage``.

        :param passage: free text to vectorise.
        :return: list of ``num_words`` ints; element ``i`` is the number of
            occurrences of the token whose index in ``word_order_dict`` is
            ``i``. Tokens outside the vocabulary are ignored.
        """
        word_vec = [0] * self.num_words
        for word in self.transform_blob(passage).split(' '):
            position = self.word_order_dict.get(word)
            if position is not None:
                word_vec[position] += 1

        return word_vec
    
    
        

In [35]:
# Build the vectorizer; its constructor reads the scraped projects JSON and
# derives the vocabulary from the descriptions.
pv = PassageVectorizer()

In [36]:
# Serialise the fitted vectorizer into the server directory.
# NOTE(review): pickle stores instances by reference to their class, so the
# process that loads this file must be able to resolve `PassageVectorizer`
# under the same module path — confirm the server provides it.
with open('../server/passage_vectorizer.pkl', 'wb') as pickle_file:
    pickle.dump(pv, pickle_file)

In [39]:
# Sanity check: length of the first stored word vector.
len(data_output[0]['word_vec'])

7036

In [38]:
# Sanity check: vocabulary size held by the vectorizer instance.
pv.num_words

7036