### Web-Scraping, Text-Mining and Information Retrieval using Twitter's Streamed Data

About this notebook: This project is a hands-on exercise in mining web-scraped text data and performing information retrieval on data obtained from Twitter. I streamed the data using Tweepy to obtain a text file of 3010 tweets, then extracted tutorial links relevant to three top programming languages: Python, JavaScript and Java.

### Tags: Text-Mining, WebScraping, Tweepy, Twitter's Streaming API, Pandas, JSON, Information Retrieval

### Import necessary libraries

In [1]:
#Import dependencies

import json
import pandas as pd
import re

In [2]:
# Return True if `word` occurs anywhere in `text` (case-insensitive); otherwise return False.

def word_in_text(word, text):
    """Report whether ``word`` occurs in ``text``, ignoring case.

    The word is treated as a literal string, not as a regular expression,
    so keywords containing regex metacharacters (e.g. ``"c++"`` or ``"."``)
    are matched character-for-character instead of raising ``re.error`` or
    matching unintended text.

    The match is a substring match: ``"java"`` is found in ``"JavaScript"``.

    Parameters
    ----------
    word : str
        The keyword to look for.
    text : str
        The text to search in.

    Returns
    -------
    bool
        True if the lower-cased word is a substring of the lower-cased text,
        otherwise False.
    """
    return word.lower() in text.lower()

In [3]:
# Use a regular expression to find the first link in a text: one starting with "http://" or "https://", or a bare "www." address.
# Return the URL if one is found; otherwise return an empty string.

def extract_link(text):
    """Return the first URL-like substring of ``text``, or ``''`` if none.

    A link is either ``http://`` / ``https://`` followed by characters, or a
    ``www.`` prefix followed by characters; in both cases the link ends at
    the first whitespace, ``<``, ``>`` or ``"`` character.
    """
    found = re.search(r'https?://[^\s<>"]+|www\.[^\s<>"]+', text)
    return found.group() if found else ''

### Extract links from the text file saved by scraping twitter data

In [4]:
#Main program to extract relevant tutorial links for Python, Javascript and Java

if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Load: read the streamed data, one JSON document per line.
    # ------------------------------------------------------------------
    tweets_data_path = 'twitter_data.txt'

    tweets_data = []
    # The context manager guarantees the file is closed, even if reading
    # fails part-way through.
    with open(tweets_data_path, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweet = json.loads(line)
            except ValueError:
                # Blank or malformed lines are skipped on purpose;
                # json.JSONDecodeError is a subclass of ValueError.
                continue
            # Records without a 'text' field would break the column built
            # below, so only records that carry one are kept.
            if isinstance(tweet, dict) and 'text' in tweet:
                tweets_data.append(tweet)

    # ------------------------------------------------------------------
    # Build the DataFrame: one row per tweet, one flag column per keyword.
    # Column order: text, python, javascript, java, programming, tutorial,
    # relevant, link.
    # ------------------------------------------------------------------
    tweets = pd.DataFrame()
    tweets['text'] = [tweet['text'] for tweet in tweets_data]

    for keyword in ('python', 'javascript'):
        tweets[keyword] = tweets['text'].apply(lambda text: word_in_text(keyword, text))

    # "java" is a prefix of "javascript", so a plain substring test would flag
    # every JavaScript-only tweet as Java. The negative lookahead only counts
    # occurrences of "java" that are not followed by "script".
    tweets['java'] = tweets['text'].apply(
        lambda text: bool(re.search(r'java(?!script)', text, re.IGNORECASE))
    )

    # Tweets about programming are targeted via "programming" / "tutorial".
    for keyword in ('programming', 'tutorial'):
        tweets[keyword] = tweets['text'].apply(lambda text: word_in_text(keyword, text))

    # Relevant if the tweet contains the word "programming" or "tutorial".
    tweets['relevant'] = tweets['text'].apply(
        lambda text: word_in_text('programming', text) or word_in_text('tutorial', text)
    )

    # First URL found in each tweet ('' when there is none).
    tweets['link'] = tweets['text'].apply(extract_link)

    # ------------------------------------------------------------------
    # Filter: relevant tweets that carry a link, split per language.
    # ------------------------------------------------------------------
    tweets_relevant = tweets[tweets['relevant'] == True]
    tweets_relevant_with_link = tweets_relevant[tweets_relevant['link'] != '']

    tweets_python = tweets_relevant_with_link[tweets_relevant_with_link['python'] == True]
    tweets_javascript = tweets_relevant_with_link[tweets_relevant_with_link['javascript'] == True]
    tweets_java = tweets_relevant_with_link[tweets_relevant_with_link['java'] == True]

    python_links = tweets_python['link']
    javascript_links = tweets_javascript['link']
    java_links = tweets_java['link']

    # ------------------------------------------------------------------
    # Report: print each language's links between a header and a footer.
    # The underline is as long as the header text.
    # ------------------------------------------------------------------
    link_sections = (
        ("PYTHON LINKS", "Python links end here", python_links),
        ("JAVASCRIPT LINKS", "JavaScript links end here", javascript_links),
        ("Java LINKS", "Java links end here", java_links),
    )
    for header, footer, links in link_sections:
        print(header)
        print("-" * len(header))
        for link in links:
            print(link)
        print(footer)

PYTHON LINKS
------------
https://t.co/O2WOYFJ64z
https://t.co/AAvrSS7lP6
https://t.co/AAvrSS7lP6
https://t.co/AAvrSS7lP6
https://t.co/c2fmTm9Ixy
https://t.co/b7qQLex7OC
https://t.co/Ijrhr2LChd
https://t.co/Ijrhr2LChd
https://t.co/iDFJXjJSTW
https://t.co/iDFJXjJSTW
https://t.co/hn6UgxYpIO
https://t.co/LrPBI3kjeI
https://t.co/5bHZEyO9AL
https://t.co/5bH…
https://t.co/yPclZhC0wk
https://t.co/aP8q7MobJS
https://t.co/N49APvScA8
https://t.co/VcyHafJ3Hv
https://t.co/gyKVN4fG5e
https://t.co/avby8I96iq
https://t.co/mP90ORznvZ
https://t.co/aP8q7MobJS
https://t.co/eNVgaJNMun
https://t.co/SM0p1HhaMK
https://t.co/eyedP8wRD5
https://t.co/jBdEJArmNx
https://t.co/Fwyogyixkl
https://t.co/uGNNF4bpF1
https://t.co/WdZN35OAbf
https://t.co/tBgiHBHTjQ
https://t.co/SMaTayjFFA
www.cyberforum.ru/python/thread2414680.html
https://t.co/zYo5J1nwYc
https://t.co/S5IeRPTWlT
https://t.co/QRW1oDXN1w
https://t.co/ZhoGbXRJGx
https://t.co/FAmMiWCknl
https://t.co/B7GQG72lqA
https://t.co/epEEfWUfZy
https://t.co/c7nzAy1vrX


In [6]:
# Dimensions of the tweets DataFrame as a (rows, columns) tuple.
tweets.shape

(3010, 8)