<a href="https://colab.research.google.com/github/msdayi/NesneProje/blob/main/20210305011_MustafaDay%C4%B1o%C4%9Flu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark py4j



In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("test").getOrCreate()

In [4]:
df = spark.read.text("book.txt")

In [5]:
df.show()

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|                    |
|This ebook is for...|
|                    |
|Title: The Advent...|
|                    |
|Author: Arthur Co...|
|                    |
|                    |
|                    |
|Release date: Mar...|
|                    |
|Most recently upd...|
|                    |
|   Language: English|
|                    |
|Credits: an anony...|
|                    |
|                    |
+--------------------+
only showing top 20 rows



In [6]:
rdd = spark.sparkContext.textFile("book.txt")

In [10]:
word_counts = (rdd.flatMap(lambda line: line.split()).map(lambda word: (word.lower(),1)).reduceByKey(lambda a, b: a + b))

In [11]:
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending = False)

In [12]:
for word, count in sorted_word_counts.take(100): print(f"{count} word '{word}'")

5716 word 'the'
2878 word 'and'
2760 word 'of'
2721 word 'to'
2648 word 'a'
2533 word 'i'
1761 word 'in'
1604 word 'that'
1371 word 'was'
1278 word 'he'
1267 word 'it'
1176 word 'you'
1146 word 'his'
1080 word 'is'
955 word 'my'
903 word 'have'
869 word 'with'
848 word 'as'
813 word 'had'
769 word 'at'
755 word 'which'
727 word 'for'
613 word 'be'
610 word 'not'
540 word 'but'
502 word 'we'
498 word 'from'
465 word 'this'
461 word 'upon'
447 word 'said'
414 word 'me'
397 word 'there'
389 word 'she'
385 word 'been'
379 word 'your'
377 word 'her'
377 word 'very'
366 word 'on'
354 word 'by'
349 word '“i'
338 word 'all'
337 word 'were'
336 word 'an'
336 word 'so'
323 word 'are'
317 word 'would'
312 word 'what'
308 word 'one'
299 word 'no'
295 word 'when'
284 word 'could'
280 word 'has'
275 word 'out'
272 word 'into'
266 word 'or'
262 word 'mr.'
261 word 'who'
257 word 'little'
255 word 'if'
254 word 'will'
253 word 'him'
250 word 'up'
238 word 'some'
235 word 'do'
210 word 'our'
208 word '

In [17]:
letters = rdd.flatMap(lambda line: list(line[0].lower()) if line else []) # If the line is empty, it returns an empty list preventing the IndexError.
#Otherwise, it proceeds with the original logic.

letters = letters.filter(lambda char: char.isalpha())

letter_counts = letters.map(lambda letter: (letter, 1)).reduceByKey(lambda a, b: a + b)

sorted_letter_counts = letter_counts.sortBy(lambda x: x[1], ascending=False)

print("En sık geçen harfler:")
for letter, count in sorted_letter_counts.collect():
    print(f"{letter}: {count}")

En sık geçen harfler:
i: 82
t: 82
h: 50
s: 37
a: 34
w: 23
l: 14
m: 12
o: 11
v: 9
b: 7
p: 6
x: 6
f: 6
r: 4
c: 3
j: 2
d: 1
n: 1
u: 1


In [21]:
bigrams = rdd.flatMap(lambda line: zip(line.split()[:-1], line.split()[1:]) if len(line.split()) > 1 else []) # Check if the line has more than one word after splitting.
# If not, return an empty list to avoid the IndexError. Otherwise, proceed with the original logic.

bigram_counts = bigrams.map(lambda pair: (f"{pair[0]} {pair[1]}", 1)).reduceByKey(lambda a, b: a + b)

sorted_bigram_counts = bigram_counts.sortBy(lambda x: x[1], ascending=False)

print("En sık geçen bigram'ler:")
for bigram, count in sorted_bigram_counts.take(10):
    print(f"{bigram}: {count}")

En sık geçen bigram'ler:
of the: 736
in the: 497
to the: 312
I have: 247
that I: 245
at the: 227
upon the: 195
to be: 191
and I: 191
and the: 189


In [24]:
characters = rdd.flatMap(lambda line: list(line) if line else [])  # If the line is empty, return an empty list. Otherwise, proceed to convert the line to a list of characters.

character_counts = characters.map(lambda char: (char, 1)).reduceByKey(lambda a, b: a + b)

sorted_character_counts = character_counts.sortBy(lambda x: x[1], ascending=False)

print("En sık geçen karakterler:")
for char, count in sorted_character_counts.take(10):
    print(f"{char}: {count}")

En sık geçen karakterler:
 : 104942
e: 54608
t: 39247
a: 35298
o: 34476
n: 29328
h: 28311
i: 27356
s: 27079
r: 25423


In [26]:
sentences = df.rdd.flatMap(lambda line: line[0].split('.'))

sentence_lengths = sentences.map(lambda sentence: len(sentence.split()))

total_sentences = sentence_lengths.count()
total_words = sentence_lengths.reduce(lambda a, b: a + b)
average_sentence_length = total_words / total_sentences

print(f"Ortalama cümle uzunluğu: {average_sentence_length}")

Ortalama cümle uzunluğu: 9.226961157654227


In [31]:
from pyspark.sql.functions import split, explode, lower, length, col, count, size

# Assuming 'df' contains the raw text data in a column named 'value'
# Split the lines into words and explode to create a new row for each word
words_df = df.select(explode(split(col("value"), " ")).alias("word"))

# Calculate the total number of words in the DataFrame
total_words = words_df.count()

print(f"Total words: {total_words}")

# Further processing for 'pct_she' would require more context about the data and how it should be calculated.
# For example, you could filter for words that match 'she' and divide by total_words to get a percentage.

Total words: 110329
