In [0]:
# Loading Libraries & Predefined Functions

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from pyspark.sql.functions import udf
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from pyspark.sql.types import *
import re
from pyspark.sql.functions import split, explode

def remove_stopwords(text):
  words =[]
  # Tokenize the text
  words =text.split()
  for word in word_tokenize(text):
    words.append(word)
  
  # # Remove the stop words
  filtered_words = [w for w in words if w not in stop_words ]
  
  # Return
  return " ".join(filtered_words)

def remove_noise(text):
  # This removes any of the punctuatons like ".,%^&" and the html tags like <p> in the body.
  text = re.sub(r'<[^>]*>', "", text)
  text = re.sub(r'[^\w\s]', "", text)
  text = re.sub(r'\b\w{1,4}\b', '', text)
#   text = re.sub(r'<[^>]*>', "", text)
  return text

def pre_process(text):
  text = remove_stopwords(text)
  text = remove_noise(text)
  return text

In [0]:
## Preprocessing the Textual contents of the files

# We can preprocess text files by reducing its complexity which means either removing stop words, noise and simpliying the tenses using stemming.
# Stemming is the process of reducing words like Loved to love or lived to live. The idea of preprocessing is to reduce the complexity and size of the files for a more accurate and faster model.

# In this code segment, I'll remove the stopwords based on the NLTK library of stop words (Refer to remove_stopwords())
# Then I'll remove the noise by removing punctuations (.?$#@!%) and removing words that are less than 4 letter long. Refer to the function remove_noise.
# To view teh result, make sure you click on the terminal window and scroll, since all 3 are shown.
files = ['/FileStore/tables/SO_Python.csv','/FileStore/tables/SO_Java.csv', '/FileStore/tables/SO_Javascript.csv' ]

for file in files:
  # Read the file into a spark Dataframe
  data = spark.read.csv(file,inferSchema=True, header=True, multiLine=True, escape='"')
  stopwordUDF = udf(lambda z: pre_process(z),StringType())
  print("Showing Preprocessed Data from {}\n".format(file))
  data.select("Body", stopwordUDF("Body")).show()




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Showing Preprocessed Data from /FileStore/tables/SO_Python.csv

+--------------------+--------------------+
|                Body|      <lambda>(Body)|
+--------------------+--------------------+
|<p>I'd like to do...|  serverside scri...|
|<p>Can you please...| please      Acco...|
|<p>I am using <co...| using win32com p...|
|<p>I'm using pip ...| using  install P...|
|<p>I want to chan...|  change Anaconda...|
|<p>I have a file ...|  every  looks   ...|
|<p>Can any one pl...|  please   prompt...|
|<p>I am building ...| building  applic...|
|<p>I have a yaml ...|    parse   simpl...|
|<p>Using Docker, ...|Using Docker   th...|
|<p>i am trying to...| trying following...|
|<p>Assume I have ...|Assume  following...|
|<p>For example, i...| example  write  ...|
|<

In [0]:
# What are the most frequent keywords in the textual contents of each programming language? 
# We can do a count of the number of words in each of the body and group them if they are alike.
# First we pre process the data and then we group like words and count them. 

import pyspark.sql.functions as f
for file in files:
  data = spark.read.csv(file,inferSchema=True, header=True, multiLine=True, escape='"')
  df = data.select(stopwordUDF("Body").alias('Body'))
  df = df.withColumn('wordCount', f.size(f.split(f.col('Body'), ' ')))
  df.select(f.sum('wordCount')).collect()

  print("Showing Keywords from file {}\n\n".format(file))
  df.withColumn('word', f.explode(f.split(f.col('Body'), ' ')))\
      .groupBy('word')\
      .count()\
      .sort('count', ascending=False)\
      .show()



Showing Keywords from file /FileStore/tables/SO_Python.csv


+---------+--------+
|     word|   count|
+---------+--------+
|         |11565729|
|   import|   96275|
|    print|   57445|
|   return|   54310|
|    using|   45746|
|    class|   42716|
|    error|   37309|
|   python|   35635|
|    would|   32929|
| function|   29351|
|   Python|   28495|
|   trying|   28165|
|    value|   27270|
|following|   23326|
|    tried|   22452|
|   output|   19976|
|   values|   18391|
|  problem|   18238|
|    first|   17109|
|  example|   17000|
+---------+--------+
only showing top 20 rows

Showing Keywords from file /FileStore/tables/SO_Java.csv


+----------------+--------+
|            word|   count|
+----------------+--------+
|                |13567350|
|          public|  189655|
|          String|  127083|
|           class|   98073|
|          import|   79041|
|          return|   77079|
|         private|   75382|
|          static|   46879|
|           using|   46245|
|          met

In [0]:
#Q3: What percentage of questions in each programming language has accepted answers?
# Get the AcceptedAnswer ID and If null = 0, and if exist replace by 1. Then count the 1 and divide by total amount.
for file in files:
  data = spark.read.csv(file,inferSchema=True, header=True, multiLine=True, escape='"')
  totalcount = data.select("AcceptedAnswerId").count()
  acceptedDF = (data.select("AcceptedAnswerId"))
  acceptedcount = acceptedDF.where(~data["AcceptedAnswerId"].isin(["null"])).count()
  print("The Percentage of answered answer based on Answer ID for file {} is {}%\n".format(file,acceptedcount/totalcount*100))


              

The Percentage of answered answer based on Answer ID for file /FileStore/tables/SO_Python.csv is 51.098%

The Percentage of answered answer based on Answer ID for file /FileStore/tables/SO_Java.csv is 50.09%

The Percentage of answered answer based on Answer ID for file /FileStore/tables/SO_Javascript.csv is 55.30199999999999%



In [0]:
# Filtering out information from titles and word counting based on word. This could help answer What types of questions are asked for each programming languages?
# There are mostly 4 kind of questions, what why how and others. 
# To find the number of questions, we go through each title and check if any of those keywords exist within the sentence
# If they do, then we count them, if they don't we count them as others.
# Some questions have more than 1 present in the title, so ther emight be composite type of questions (why + what)

def process_title(title):
  words = title.split()
  options = ["why", 'what', 'how']
  word = [w.lower() if w.lower() in options else "" for w in words ]
  if "".join(word) == "":
    word.append("other")
  return "".join(word)

titleUDF = udf(lambda z: process_title(z), StringType())

for file in files:
  data = spark.read.csv(file,inferSchema=True, header=True, multiLine=True, escape='"')
  totalcount = data.select("Title").count()
  titleDF = data.select(titleUDF("Title").alias('Title'))
  print("The Type of questions asked based on keywords in the Title in file {} are:\n".format(file))
  titleDF.groupBy('Title')\
      .count()\
      .sort('count', ascending=False)\
      .show()
  







The Type of questions asked based on keywords in the Title in file /FileStore/tables/SO_Python.csv are:

+----------+-----+
|     Title|count|
+----------+-----+
|     other|37103|
|       how|10916|
|       why| 1194|
|      what|  684|
|   howwhat|   30|
|    howhow|   26|
|   whathow|   11|
|    whyhow|    8|
|  whatwhat|    7|
|   whywhat|    7|
|   whatwhy|    6|
|    whywhy|    4|
|    howwhy|    3|
|howwhathow|    1|
+----------+-----+

The Type of questions asked based on keywords in the Title in file /FileStore/tables/SO_Java.csv are:

+--------+-----+
|   Title|count|
+--------+-----+
|   other|37325|
|     how|10144|
|     why| 1417|
|    what| 1001|
|  howhow|   41|
| howwhat|   22|
| whathow|   19|
|  whyhow|   10|
|whatwhat|    7|
|  howwhy|    6|
| whywhat|    5|
| whatwhy|    3|
+--------+-----+

The Type of questions asked based on keywords in the Title in file /FileStore/tables/SO_Javascript.csv are:

+----------+-----+
|     Title|count|
+----------+-----+
|     othe